Revert "Remove gmp from AOS"

This reverts commit f37c97684f0910a3f241394549392f00145ab0f7.

We need gmp for SymEngine for symbolic manipulation in C++.

Change-Id: Ia13216d1715cf96944f7b4f422b7a799f921d4a4
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/third_party/gmp/mpn/x86/README b/third_party/gmp/mpn/x86/README
new file mode 100644
index 0000000..8d7ac90
--- /dev/null
+++ b/third_party/gmp/mpn/x86/README
@@ -0,0 +1,525 @@
+Copyright 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+	x86               i386, generic
+	x86/i486          i486
+	x86/pentium       Intel Pentium (P5, P54)
+	x86/pentium/mmx   Intel Pentium with MMX (P55)
+	x86/p6            Intel Pentium Pro
+	x86/p6/mmx        Intel Pentium II, III
+	x86/p6/p3mmx      Intel Pentium III
+	x86/k6            \ AMD K6
+	x86/k6/mmx        /
+	x86/k6/k62mmx     AMD K6-2
+	x86/k7            \ AMD Athlon
+	x86/k7/mmx        /
+	x86/pentium4      \
+	x86/pentium4/mmx  | Intel Pentium 4
+	x86/pentium4/sse2 /
+
+
+The top-level x86 directory contains blended style code, meant to be
+reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but there's nothing
+specific for Cyrix chips, nor for actual 80386 and 80486 chips.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style assembler code, first put through m4 for
+macro processing.  The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4.  See comments in those files.
+
+The code is meant for use with GNU "gas" or a system "as".  There's no
+support for assemblers that demand Intel style code.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too.  For example,
+mpn_mul_1() has the following.
+
+        defframe(PARAM_MULTIPLIER, 16)
+        defframe(PARAM_SIZE,       12)
+        defframe(PARAM_SRC,         8)
+        defframe(PARAM_DST,         4)
+
+PARAM_MULTIPLIER becomes `FRAME+16(%esp)', and the others similarly.  The
+return address is at offset 0, but there's not normally any need to access
+that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct.  At the start of a routine FRAME should be zero.
+
+        deflit(`FRAME',0)
+	...
+	deflit(`FRAME',4)
+	...
+	deflit(`FRAME',8)
+	...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
+
+There's generally some slackness in redefining FRAME.  If new values aren't
+going to get used then the redefinitions are omitted to keep from cluttering
+up the code.  This happens for instance at the end of a routine, where there
+might be just four pops and then a ret, so FRAME isn't getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer.  For
+example,
+
+	defframe(SAVE_ESI,   -4)
+	defframe(SAVE_EDI,   -8)
+	defframe(VAR_COUNTER,-12)
+
+	deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
+
+Definitions for pushed registers are only put in when they're going to be
+used.  If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available, certainly
+that's all the Solaris 8 "as" seems to accept.  If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+	addl	$32/2, %eax           <-- wrong
+
+	addl	$eval(32/2), %eax     <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start.  FreeBSD patches 2.9.1 to select
+the latter, and from 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead.  For that
+reason "C" commenting is used (see asm-defs.m4) and the intermediate ".s"
+files have no comments.
+
+Any comments before include(`../config.m4') must use m4 "dnl", since it's
+only after the include that "C" is available.  By convention "dnl" is also
+used for comments about m4 macros.
+
+
+
+TEMPORARY LABELS
+
+Temporary numbered labels like "1:" used as "1f" or "1b" are available in
+"gas" and Solaris "as", but not in SCO "as".  Normal L() labels should be
+used instead, possibly with a counter to make them unique, see jadcl0() in
+x86-defs.m4 for instance.  A separate counter for each macro makes it
+possible to nest them, for instance movl_text_address() can be used within
+an ASSERT().
+
+"1:" etc must be avoided in gcc __asm__ blocks too.  "%=" for generating a
+unique number looks like a good alternative, but is that actually a
+documented feature?  In any case this problem doesn't currently arise.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement.  These are
+either for computed jumps or to get desirable code alignment.  Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx).  The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it.  In general changing would be the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers.  gas takes either,
+Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test GMP_ASM_X86_SHLDL_CL is used to determine whether
+%cl should be used, and the macros shldl, shrdl, shldw and shrdw in
+mpn/x86/x86-defs.m4 pass through or omit %cl as necessary.  See the comments
+with those macros for usage.
+
+
+
+IMUL INSTRUCTION
+
+GCC config/i386/i386.md (cvs rev 1.187, 21 Oct 00) under *mulsi3_1 notes
+that the following two forms produce identical object code
+
+	imul	$12, %eax
+	imul	$12, %eax, %eax
+
+but that the former isn't accepted by some assemblers, in particular the SCO
+OSR5 COFF assembler.  GMP follows GCC and uses only the latter form.
+
+(This applies only to immediate operands, the three operand form is only
+valid with an immediate.)
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit.  (See iBCS2 and SVR4 ABI books, references below.)
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected.  Since it's better to be safe than
+sorry, GMP follows glibc and does a "cld" if it depends on the direction
+flag being clear.  This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+  Coding Style
+
+    Defining the symbol PIC in m4 processing selects SVR4 / ELF style
+    position independent code.  This is necessary for shared libraries
+    because they can be mapped into different processes at different virtual
+    addresses.  Actually, relocations are allowed but text pages with
+    relocations aren't shared, defeating the purpose of a shared library.
+
+    The GOT is used to access global data, and the PLT is used for
+    functions.  The use of the PLT adds a fixed cost to every function call,
+    and the GOT adds a cost to any function accessing global variables.
+    These are small but might be noticeable when working with small
+    operands.
+
+  Scope
+
+    It's intended, as a matter of policy, that references within libgmp are
+    resolved within libgmp.  Certainly there's no need for an application to
+    replace any internals, and we take the view that there's no value in an
+    application subverting anything documented either.
+
+    Resolving references within libgmp in theory means calls can be made with a
+    plain PC-relative call instruction, which is faster and smaller than going
+    through the PLT, and data references can be similarly PC-relative, saving a
+    GOT entry and fetch from there.  Unfortunately the normal linker behaviour
+    doesn't allow us to do this.
+
+    By default an R_386_PC32 PC-relative reference, either for a call or for
+    data, is left in libgmp.so by the linker so that it can be resolved at
+    runtime to a location in the application or another shared library.  This
+    means a text segment relocation which we don't want.
+
+  -Bsymbolic
+
+    Under the "-Bsymbolic" option, the linker resolves references to symbols
+    within libgmp.so.  This gives us the desired effect for R_386_PC32,
+    ie. it's resolved at link time.  It also resolves R_386_PLT32 calls
+    directly to their target without creating a PLT entry (though if this is
+    done to normal compiler-generated code it still leaves a setup of %ebx
+    to _GLOBAL_OFFSET_TABLE_ which may then be unnecessary).
+
+    Unfortunately -Bsymbolic does bad things to global variables defined in
+    a shared library but accessed by non-PIC code from the mainline (or a
+    static library).
+
+    The problem is that the mainline needs a fixed data address to avoid
+    text segment relocations, so space is allocated in its data segment and
+    the value from the variable is copied from the shared library's data
+    segment when the library is loaded.  Under -Bsymbolic, however,
+    references in the shared library are then resolved still to the shared
+    library data area.  Not surprisingly it bombs badly to have mainline
+    code and library code accessing different locations for what should be
+    one variable.
+
+    Note that this -Bsymbolic effect for the shared library is not just for
+    R_386_PC32 offsets which might have been cooked up in assembler, but is
+    done also for the contents of GOT entries.  -Bsymbolic simply applies a
+    general rule that symbols are resolved first from the local module.
+
+  Visibility Attributes
+
+    GCC __attribute__ ((visibility ("protected"))), which is available in
+    recent versions, eg. 3.3, is probably what we'd like to use.  It makes
+    gcc generate plain PC-relative calls to indicated functions, and directs
+    the linker to resolve references to the given function within the link
+    module.
+
+    Unfortunately, as of debian binutils 2.13.90.0.16 at least, the
+    resulting libgmp.so comes out with text segment relocations, references
+    are not resolved at link time.  If the gcc description is to be believed
+    this is not how it should work.  If a symbol cannot be overridden
+    by another module then surely references within that module can be
+    resolved immediately (ie. at link time).
+
+  Present
+
+    In any case, all this means that we have no optimizations we can
+    usefully make to function or variable usages, neither for assembler nor
+    C code.  Perhaps in the future the visibility attribute will work as
+    we'd like.
+
+
+
+
+GLOBAL OFFSET TABLE
+
+The magic _GLOBAL_OFFSET_TABLE_ used by code establishing the address of the
+GOT sometimes requires an extra underscore prefix.  SVR4 systems and NetBSD
+don't need a prefix, OpenBSD does need one.  Note that NetBSD and OpenBSD
+are both a.out underscore systems, so the prefix for _GLOBAL_OFFSET_TABLE_
+is not simply the same as the prefix for ordinary globals.
+
+In any case in the asm code we write _GLOBAL_OFFSET_TABLE_ and let a macro
+in x86-defs.m4 add an extra underscore if required (according to a configure
+test).
+
+Old gas 1.92.3 which comes with FreeBSD 2.2.8 gets a segmentation fault when
+asked to assemble the following,
+
+        L1:
+            addl  $_GLOBAL_OFFSET_TABLE_+[.-L1], %ebx
+
+It seems that using the label in the same instruction it refers to is the
+problem, since a nop in between works.  But the simplest workaround is to
+follow gcc and omit the +[.-L1] since it does nothing,
+
+            addl  $_GLOBAL_OFFSET_TABLE_, %ebx
+
+Current gas 2.10 generates incorrect object code when %eax is used in such a
+construction (with or without +[.-L1]),
+
+            addl  $_GLOBAL_OFFSET_TABLE_, %eax
+
+The R_386_GOTPC gets a displacement of 2 rather than the 1 appropriate for
+the 1 byte opcode of "addl $n,%eax".  The best workaround is just to use any
+other register, since then it's a two byte opcode+mod/rm.  GCC for example
+always uses %ebx (which is needed for calls through the PLT).
+
+A similar problem occurs in an leal (again with or without a +[.-L1]),
+
+            leal  _GLOBAL_OFFSET_TABLE_(%edi), %ebx
+
+This time the R_386_GOTPC gets a displacement of 0 rather than the 2
+appropriate for the opcode and mod/rm, making this form unusable.
+
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster.  Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes.  To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size.  The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code.  The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop.  If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC.  Something like the following is typical.
+
+	deflit(UNROLL_THRESHOLD, ifdef(`PIC',10,8))
+
+There's no automated way to determine the threshold.  Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined.  Alternately, just adjust the threshold up or down until
+there are no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop.  Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop.  When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected.  Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+	S = operand size in limbs
+	N = number of limbs per loop (UNROLL_COUNT)
+	L = log2 of unrolling (UNROLL_LOG2)
+	M = mask for unrolling (UNROLL_MASK)
+	C = code bytes per limb in the loop
+	B = bytes per limb (4 for x86)
+
+	computed jump            (-S & M) * C + entrypoint
+	subtract from pointers   (-S & M) * B
+	initial loop counter     (S-1) >> L
+	displacements            0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1.  The displacements are for
+the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+	add to pointers          (-S & M) * B
+	displacements            0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect GMP code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero.  For example,
+
+		addl	$bar-foo, %eax
+	foo:
+		nop
+	bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+	foo:
+		addl	$bar-foo, %eax
+		nop
+	bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal.  For example,
+
+	foo:
+		nop
+	bar:
+		leal	bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand".  When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+		leal	foo(%eax,%ebx,8), %ecx
+		nop
+	foo:
+
+These problems only affect PIC computed jump calculations.  The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1, 2a, 2b, 3a, 3b,
+2006, order numbers 253665 through 253669.  Available on-line,
+
+	ftp://download.intel.com/design/Pentium4/manuals/25366518.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366618.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366718.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366818.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366918.pdf
+
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9.  And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X.  These have details of calling
+conventions and ELF shared library PIC coding.  Versions of both available
+on-line,
+
+	http://www.sco.com/developer/devspecs
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2.  (Same as the above 386
+ABI supplement.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/aors_n.asm b/third_party/gmp/mpn/x86/aors_n.asm
new file mode 100644
index 0000000..5d359f5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/aors_n.asm
@@ -0,0 +1,202 @@
+dnl  x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1992, 1994-1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P5	3.375
+C P6	3.125
+C K6	3.5
+C K7	2.25
+C P4	8.75
+
+
+ifdef(`OPERATION_add_n',`
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                    mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_nc)
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	L(oopgo)		C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
+	call	L(0a)
+L(0a):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$L(oop)-L(0a)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+	leal	L(oop)-3(%eax,%eax,8),%eax
+')
+
+	C These lines initialize carry from the 5th parameter.  Should be
+	C possible to simplify.
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	jmp	*%eax			C jump into loop
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(M4_function_n)
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	L(oop)			C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+	call	L(0b)
+L(0b):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$L(oop)-L(0b)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+	leal	L(oop)-3(%eax,%eax,8),%eax
+')
+	jmp	*%eax			C jump into loop
+
+L(oopgo):
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	ALIGN(16)
+L(oop):	movl	(%esi),%eax
+	M4_inst	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	M4_inst	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	M4_inst	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	M4_inst	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	M4_inst	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	M4_inst	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	M4_inst	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	M4_inst	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/aorsmul_1.asm b/third_party/gmp/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000..54a8905
--- /dev/null
+++ b/third_party/gmp/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,156 @@
+dnl  x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl  limb and add the result to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				14.75
+C P6 model 0-8,10-12		 7.5
+C P6 model 9  (Banias)		 6.7
+C P6 model 13 (Dothan)		 6.75
+C P4 model 0  (Willamette)	24.0
+C P4 model 1  (?)		24.0
+C P4 model 2  (Northwood)	24.0
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom
+C AMD K6			12.5
+C AMD K7			 5.25
+C AMD K8
+C AMD K10
+
+
+ifdef(`OPERATION_addmul_1',`
+      define(M4_inst,        addl)
+      define(M4_function_1,  mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1',`
+      define(M4_inst,        subl)
+      define(M4_function_1,  mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult);
+
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE,       `FRAME+12(%esp)')
+define(PARAM_SRC,        `FRAME+8(%esp)')
+define(PARAM_DST,        `FRAME+4(%esp)')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	M4_inst	%eax,(%edi)
+	adcl	%edx,%ebx	C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_SIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(end)
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebp,4(%edi)
+	adcl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,8(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	M4_inst	%ebp,12(%edi)
+	adcl	$0,%ebx		C propagate carry into cylimb
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oop)
+
+L(end):	movl	%ebx,%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm b/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..cd1a650
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl  Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31)
+
+ifdef(`OPERATION_addlsh1_n', `
+	define(M4_inst,        adc)
+	define(M4_opp,         sub)
+	define(M4_function,    mpn_addlsh1_n)
+	define(M4_function_c,  mpn_addlsh1_nc)
+',`ifdef(`OPERATION_rsblsh1_n', `
+	define(M4_inst,        sbb)
+	define(M4_opp,         add)
+	define(M4_function,    mpn_rsblsh1_n)
+	define(M4_function_c,  mpn_rsblsh1_nc)
+',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm b/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..10f4419
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl  Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(M4_inst,        adcl)
+	define(M4_opp,         subl)
+	define(M4_function,    mpn_addlsh2_n)
+	define(M4_function_c,  mpn_addlsh2_nc)
+',`ifdef(`OPERATION_rsblsh2_n', `
+	define(M4_inst,        sbbl)
+	define(M4_opp,         addl)
+	define(M4_function,    mpn_rsblsh2_n)
+	define(M4_function_c,  mpn_rsblsh2_nc)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm b/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm
new file mode 100644
index 0000000..71cfe49
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm
@@ -0,0 +1,156 @@
+dnl  Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_signed_limb_t carry);
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CORB,	20)
+defframe(PARAM_SIZE,	16)
+defframe(PARAM_DBLD,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_DBLD')
+define(SAVE_VP,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+	movl	PARAM_CORB, %eax
+	movl	%eax, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %eax
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	xor	%eax, %eax
+	xor	%edx, %edx
+L(start_nc):
+	push	rp			FRAME_pushl()
+
+	mov	PARAM_SIZE, %ecx	C size
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	incl	%ecx			C size + 1
+	mov	PARAM_SRC, up
+	mov	vp, SAVE_VP
+	shr	%ecx			C (size+1)\2
+	mov	PARAM_DBLD, vp
+	mov	%ebp, SAVE_EBP
+	mov	%ecx, VAR_COUNT
+	jnc	L(entry)		C size odd
+
+	shr	%edx			C size even
+	mov	(vp), %ecx
+	lea	4(vp), vp
+	lea	(%eax,%ecx,M), %edx
+	mov	%ecx, %eax
+	lea	-4(up), up
+	lea	-4(rp), rp
+	jmp	L(enteven)
+
+	ALIGN(16)
+L(oop):
+	lea	(%eax,%ecx,M), %ebp
+	shr	$RSH, %ecx
+	mov	4(vp), %eax
+	shr	%edx
+	lea	8(vp), vp
+	M4_inst	(up), %ebp
+	lea	(%ecx,%eax,M), %edx
+	mov	%ebp, (rp)
+L(enteven):
+	M4_inst	4(up), %edx
+	lea	8(up), up
+	mov	%edx, 4(rp)
+	adc	%edx, %edx
+	shr	$RSH, %eax
+	lea	8(rp), rp
+L(entry):
+	mov	(vp), %ecx
+	decl	VAR_COUNT
+	jnz	L(oop)
+
+	lea	(%eax,%ecx,M), %ebp
+	shr	$RSH, %ecx
+	shr	%edx
+	mov	SAVE_VP, vp
+	M4_inst	(up), %ebp
+	mov	%ecx, %eax
+	mov	SAVE_UP, up
+	M4_inst	$0, %eax
+	mov	%ebp, (rp)
+	mov	SAVE_EBP, %ebp
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/aors_n.asm b/third_party/gmp/mpn/x86/atom/aors_n.asm
new file mode 100644
index 0000000..45ec287
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aors_n.asm
@@ -0,0 +1,159 @@
+dnl  Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[].
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 3
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                         mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                   mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+define(`cy',  `%ecx')
+define(`r1',  `%ecx')
+define(`r2',  `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_n)
+	xor	cy, cy			C carry
+L(start):
+	mov	PARAM_SIZE, %eax	C size
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC1, up
+	shr	%eax			C size >> 1
+	mov	vp, SAVE_VP
+	mov	PARAM_SRC2, vp
+	jz	L(one)			C size == 1
+	jc	L(three)		C size % 2 == 1
+
+	shr	cy
+	mov	(up), r2
+	lea	4(up), up
+	lea	4(vp), vp
+	lea	-4(rp), rp
+	jmp	L(entry)
+L(one):
+	shr	cy
+	mov	(up), r1
+	jmp	L(end)
+L(three):
+	shr	cy
+	mov	(up), r1
+
+	ALIGN(16)
+L(oop):
+	M4_inst	(vp), r1
+	lea	8(up), up
+	mov	-4(up), r2
+	lea	8(vp), vp
+	mov	r1, (rp)
+L(entry):
+	M4_inst	-4(vp), r2
+	lea	8(rp), rp
+	dec	%eax
+	mov	(up), r1
+	mov	r2, -4(rp)
+	jnz	L(oop)
+
+L(end):					C %eax is zero here
+	mov	SAVE_UP, up
+	M4_inst	(vp), r1
+	mov	SAVE_VP, vp
+	mov	r1, (rp)
+	adc	%eax, %eax
+	mov	SAVE_RP, rp
+	ret
+EPILOGUE()
+
+PROLOGUE(M4_function_nc)
+	mov	PARAM_CARRY, cy		C carry
+	jmp	L(start)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/aorslshC_n.asm b/third_party/gmp/mpn/x86/atom/aorslshC_n.asm
new file mode 100644
index 0000000..75ace65
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorslshC_n.asm
@@ -0,0 +1,247 @@
+dnl  Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C				mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C				mp_signed_limb_t borrow);
+
+defframe(PARAM_CORB,	16)
+defframe(PARAM_SIZE,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t borrow);
+
+C if src1 == dst, _ip1 is used
+
+C					cycles/limb
+C				dst!=src1,src2	dst==src1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 7		 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(GPARAM_CORB,	20)
+defframe(GPARAM_SIZE,	16)
+defframe(GPARAM_SRC2,	12)
+
+dnl  re-use parameter space
+define(SAVE_EBP,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp',  `%edi')
+define(`up',  `%esi')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_ip_function_c)
+deflit(`FRAME',0)
+	movl	PARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_ip_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(start_nc):
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SIZE, %ebx	C size
+L(inplace):
+	incl	%ebx			C size + 1
+	shr	%ebx			C (size+1)\2
+	mov	%ebp, SAVE_EBP
+	jnc	L(entry)		C size odd
+
+	add	%edx, %edx		C size even
+	mov	%ecx, %ebp
+	mov	(up), %ecx
+	lea	-4(rp), rp
+	lea	(%ebp,%ecx,M), %eax
+	lea	4(up), up
+	jmp	L(enteven)
+
+	ALIGN(16)
+L(oop):
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	mov	4(up), %ecx
+	add	%edx, %edx
+	lea	8(up), up
+	M4_inst	%ebp, (rp)
+	lea	(%eax,%ecx,M), %eax
+
+L(enteven):
+	M4_inst	%eax, 4(rp)
+	lea	8(rp), rp
+
+	sbb	%edx, %edx
+	shr	$RSH, %ecx
+
+L(entry):
+	mov	(up), %eax
+	decl	%ebx
+	jnz	L(oop)
+
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	shr	%edx
+	M4_inst	%ebp, (rp)
+	mov	SAVE_UP, up
+	adc	$0, %eax
+	mov	SAVE_EBP, %ebp
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+	movl	GPARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(generic_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(generic_nc):
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	cmp	rp, up
+	mov	%ebx, SAVE_EBX
+	jne	L(general)
+	mov	GPARAM_SIZE, %ebx	C size
+	mov	GPARAM_SRC2, up
+	jmp	L(inplace)
+
+L(general):
+	mov	GPARAM_SIZE, %eax	C size
+	mov	%ebx, SAVE_EBX
+	incl	%eax			C size + 1
+	mov	up, %ebx		C vp
+	mov	GPARAM_SRC2, up		C up
+	shr	%eax			C (size+1)\2
+	mov	%ebp, SAVE_EBP
+	mov	%eax, GPARAM_SIZE
+	jnc	L(entry2)		C size odd
+
+	add	%edx, %edx		C size even
+	mov	%ecx, %ebp
+	mov	(up), %ecx
+	lea	-4(rp), rp
+	lea	-4(%ebx), %ebx
+	lea	(%ebp,%ecx,M), %eax
+	lea	4(up), up
+	jmp	L(enteven2)
+
+	ALIGN(16)
+L(oop2):
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	mov	4(up), %ecx
+	add	%edx, %edx
+	lea	8(up), up
+	mov	(%ebx), %edx
+	M4_inst	%ebp, %edx
+	lea	(%eax,%ecx,M), %eax
+	mov	%edx, (rp)
+L(enteven2):
+	mov	4(%ebx), %edx
+	lea	8(%ebx), %ebx
+	M4_inst	%eax, %edx
+	mov	%edx, 4(rp)
+	sbb	%edx, %edx
+	shr	$RSH, %ecx
+	lea	8(rp), rp
+L(entry2):
+	mov	(up), %eax
+	decl	GPARAM_SIZE
+	jnz	L(oop2)
+
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	shr	%edx
+	mov	(%ebx), %edx
+	M4_inst	%ebp, %edx
+	mov	%edx, (rp)
+	mov	SAVE_UP, up
+	adc	$0, %eax
+	mov	SAVE_EBP, %ebp
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm b/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm
new file mode 100644
index 0000000..31e908e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm
@@ -0,0 +1,35 @@
+dnl  Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel
+dnl  division by 1-limb divisor, returning quotient only.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`x86/pentium/bdiv_q_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/cnd_add_n.asm b/third_party/gmp/mpn/x86/atom/cnd_add_n.asm
new file mode 100644
index 0000000..50bf2ad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/cnd_add_n.asm
@@ -0,0 +1,113 @@
+dnl  X86 mpn_cnd_add_n optimised for Intel Atom.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 ?
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 ?
+C P4 model 3-4 (Prescott)	 ?
+C Intel Atom			 4.67
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+
+	mov	cnd, %eax		C make cnd into a mask (1)
+	mov	24(%esp), rp
+	neg	%eax			C make cnd into a mask (1)
+	mov	28(%esp), up
+	sbb	%eax, %eax		C make cnd into a mask (1)
+	mov	32(%esp), vp
+	mov	%eax, cnd		C make cnd into a mask (1)
+	mov	36(%esp), n
+
+	xor	%edx, %edx
+
+	shr	$1, n
+	jnc	L(top)
+
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	4(vp), vp
+	add	0(up), %eax
+	lea	4(rp), rp
+	lea	4(up), up
+	sbb	%edx, %edx
+	mov	%eax, -4(rp)
+	inc	n
+	dec	n
+	je	L(end)
+
+L(top):	sbb	%edx, %edx
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	-4(vp), %ebx
+	and	cnd, %ebx
+	add	%edx, %edx
+	adc	0(up), %eax
+	lea	8(up), up
+	mov	%eax, -8(rp)
+	adc	-4(up), %ebx
+	dec	n
+	mov	%ebx, -4(rp)
+	jne	L(top)
+
+L(end):	mov	$0, %eax
+	adc	%eax, %eax
+
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm b/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..221bedc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm
@@ -0,0 +1,124 @@
+dnl  X86 mpn_cnd_sub_n optimised for Intel Atom.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 ?
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 ?
+C P4 model 3-4 (Prescott)	 ?
+C Intel Atom			 5.67
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_cnd_sub_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+
+	mov	cnd, %eax		C make cnd into a mask (1)
+	mov	24(%esp), rp
+	neg	%eax			C make cnd into a mask (1)
+	mov	28(%esp), up
+	sbb	%eax, %eax		C make cnd into a mask (1)
+	mov	32(%esp), vp
+	mov	%eax, cnd		C make cnd into a mask (1)
+	mov	36(%esp), n
+
+	xor	%edx, %edx
+
+	inc	n
+	shr	n
+	jnc	L(ent)
+
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	4(vp), vp
+	mov	0(up), %edx
+	sub	%eax, %edx
+	lea	4(rp), rp
+	lea	4(up), up
+	mov	%edx, -4(rp)
+	sbb	%edx, %edx		C save cy
+
+L(ent):	mov	0(vp), %ebx
+	and	cnd, %ebx
+	add	%edx, %edx		C restore cy
+	mov	0(up), %edx
+	dec	n
+	je	L(end)
+
+L(top):	sbb	%ebx, %edx
+	mov	4(vp), %eax
+	mov	%edx, 0(rp)
+	sbb	%edx, %edx		C save cy
+	mov	8(vp), %ebx
+	lea	8(up), up
+	and	cnd, %ebx
+	and	cnd, %eax
+	add	%edx, %edx		C restore cy
+	mov	-4(up), %edx
+	lea	8(rp), rp
+	sbb	%eax, %edx
+	mov	%edx, -4(rp)
+	dec	n
+	mov	0(up), %edx
+	lea	8(vp), vp
+	jne	L(top)
+
+L(end):	sbb	%ebx, %edx
+	mov	%edx, 0(rp)
+
+	mov	$0, %eax
+	adc	%eax, %eax
+
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/dive_1.asm b/third_party/gmp/mpn/x86/atom/dive_1.asm
new file mode 100644
index 0000000..71036a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/dive_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86/pentium/dive_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/gmp-mparam.h b/third_party/gmp/mpn/x86/atom/gmp-mparam.h
new file mode 100644
index 0000000..e025bb7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     17
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 72.60% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define DIV_1_VS_MUL_1_PERCENT             236
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               178
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     115
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                366
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    404, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    351, 9}, {    703,10}, \
+    {    367, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    735,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1279,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    831,11}, {   1663,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1791,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3839,15}, \
+    {   1023,14}, {   2047,13}, {   4223,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    368, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,10}, {    303, 9}, {    607,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1599,13}, {    895,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4351,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4351,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  56
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 111
+#define SQRLO_SQR_THRESHOLD               6654
+
+#define DC_DIV_QR_THRESHOLD                 67
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                172
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               250
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              276
+#define REDC_1_TO_REDC_N_THRESHOLD          68
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD              116
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1341
+
+#define POWM_SEC_TABLE  1,16,98,376,1259
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        23
+#define SET_STR_DC_THRESHOLD               298
+#define SET_STR_PRECOMPUTE_THRESHOLD      1037
+
+#define FAC_DSC_THRESHOLD                  171
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 3.71% faster than 1 */
+#define HGCD_THRESHOLD                     128
+#define HGCD_APPR_THRESHOLD                186
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                339
+#define JACOBI_BASE_METHOD                   3  /* 2.58% faster than 2 */
+
+/* Tuneup completed successfully, took 214190 seconds */
diff --git a/third_party/gmp/mpn/x86/atom/logops_n.asm b/third_party/gmp/mpn/x86/atom/logops_n.asm
new file mode 100644
index 0000000..3cb6d73
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/logops_n.asm
@@ -0,0 +1,151 @@
+dnl  Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C				   cycles/limb
+C				op	nop	opn
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 3	 3.5	 3.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4_inst',     `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n,     , andl,    )
+M4_choose_op( andn_n,    , andl, yes)
+M4_choose_op( nand_n, yes, andl,    )
+M4_choose_op( ior_n,     ,  orl,    )
+M4_choose_op( iorn_n,    ,  orl, yes)
+M4_choose_op( nior_n, yes,  orl,    )
+M4_choose_op( xor_n,     , xorl,    )
+M4_choose_op( xnor_n, yes, xorl,    )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
+C
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+define(`cnt', `%eax')
+define(`r1',  `%ecx')
+define(`r2',  `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function)
+	mov	PARAM_SIZE, cnt		C size
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC1, up
+	shr	cnt			C size >> 1
+	mov	vp, SAVE_VP
+	mov	PARAM_SRC2, vp
+	mov	(up), r1
+	jz	L(end)			C size == 1
+	jnc	L(even)			C size % 2 == 0
+
+	ALIGN(16)
+L(oop):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	M4_inst	(vp), r1
+	lea	8(up), up
+	mov	-4(up), r2
+M4post(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	lea	8(vp), vp
+	mov	r1, (rp)
+L(entry):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r2)')
+	M4_inst	-4(vp), r2
+	lea	8(rp), rp
+M4post(`	notl_or_xorl_GMP_NUMB_MASK(r2)')
+	dec	cnt
+	mov	(up), r1
+	mov	r2, -4(rp)
+	jnz	L(oop)
+
+L(end):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	mov	SAVE_UP, up
+	M4_inst	(vp), r1
+M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
+	mov	SAVE_VP, vp
+	mov	r1, (rp)
+	mov	SAVE_RP, rp
+	ret
+
+L(even):
+	mov	r1, r2
+	lea	4(up), up
+	lea	4(vp), vp
+	lea	-4(rp), rp
+	jmp	L(entry)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/lshift.asm b/third_party/gmp/mpn/x86/atom/lshift.asm
new file mode 100644
index 0000000..f2c70dd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/lshift.asm
@@ -0,0 +1,218 @@
+dnl  Intel Atom mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			unsigned cnt);
+
+C				  cycles/limb
+C				cnt!=1	cnt==1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5	 2.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_lshift)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,cnt
+	jne	L(normal)
+	cmpl	rp, up
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(up,%edx,4),%eax
+	cmpl	%eax,rp
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	lea	-4(up,%edx,4), up
+	mov	%ebx, SAVE_EBX
+	lea	-4(rp,%edx,4), rp
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shl	%cl, %ebx
+	neg	cnt
+	shr	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	shr	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	-4(up), %edx
+	shr	%cl, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	4(rp), rp
+	lea	-4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+	ALIGN(8)
+L(top):	shl	%cl, %ebp
+	or	%ebp, %edx
+	shl	%cl, %ebx
+	neg	cnt
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, -4(rp)
+	shr	%cl, %eax
+	lea	-8(rp), rp
+L(lo1):	mov	-8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	lea	-8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shl	%cl, %ebp
+	shl	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	mov	%edx, -4(rp)
+	mov	%ebx, -8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+
+L(special):
+deflit(`FRAME',4)
+	lea	3(%edx), %eax		C size + 3
+	dec	%edx			C size - 1
+	mov	(up), %ecx
+	shr	$2, %eax		C (size + 3) / 4
+	and	$3, %edx		C (size - 1) % 4
+	jz	L(goloop)		C jmp if  size == 1 (mod 4)
+	shr	%edx
+	jnc	L(odd)			C jump if  size == 3 (mod 4)
+
+	add	%ecx, %ecx
+	lea	4(up), up
+	mov	%ecx, (rp)
+	mov	(up), %ecx
+	lea	4(rp), rp
+
+	dec	%edx
+	jnz	L(goloop)		C jump if  size == 0 (mod 4)
+L(odd):	lea	-8(up), up
+	lea	-8(rp), rp
+	jmp	L(sentry)		C reached if size == 2 or 3 (mod 4)
+
+L(sloop):
+	adc	%ecx, %ecx
+	mov	4(up), %edx
+	mov	%ecx, (rp)
+	adc	%edx, %edx
+	mov	8(up), %ecx
+	mov	%edx, 4(rp)
+L(sentry):
+	adc	%ecx, %ecx
+	mov	12(up), %edx
+	mov	%ecx, 8(rp)
+	adc	%edx, %edx
+	lea	16(up), up
+	mov	%edx, 12(rp)
+	lea	16(rp), rp
+	mov	(up), %ecx
+L(goloop):
+	decl	%eax
+	jnz	L(sloop)
+
+L(squit):
+	adc	%ecx, %ecx
+	mov	%ecx, (rp)
+	adc	%eax, %eax
+
+	mov	SAVE_UP, up
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/lshiftc.asm b/third_party/gmp/mpn/x86/atom/lshiftc.asm
new file mode 100644
index 0000000..5be53ed
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl  Intel Atom mpn_lshiftc -- mpn left shift with complement.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			 unsigned cnt);
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+
+PROLOGUE(mpn_lshiftc)
+deflit(`FRAME',0)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+	lea	-4(up,%edx,4), up
+	mov	%ebx, SAVE_EBX
+	lea	-4(rp,%edx,4), rp
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shl	%cl, %ebx
+	neg	cnt
+	shr	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	not	%ebx
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	shr	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	-4(up), %edx
+	shr	%cl, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	4(rp), rp
+	lea	-4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+L(top):	shl	%cl, %ebp
+	or	%ebp, %edx
+	shl	%cl, %ebx
+	neg	cnt
+	not	%edx
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, -4(rp)
+	shr	%cl, %eax
+	lea	-8(rp), rp
+L(lo1):	mov	-8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	not	%eax
+	lea	-8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shl	%cl, %ebp
+	shl	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	not	%edx
+	not	%ebx
+	mov	%edx, -4(rp)
+	mov	%ebx, -8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/mmx/copyd.asm b/third_party/gmp/mpn/x86/atom/mmx/copyd.asm
new file mode 100644
index 0000000..b80fb03
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/copyd.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86/k7/mmx/copyd.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mmx/copyi.asm b/third_party/gmp/mpn/x86/atom/mmx/copyi.asm
new file mode 100644
index 0000000..49b6b8d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/copyi.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86/k7/mmx/copyi.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm b/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm
new file mode 100644
index 0000000..3fe8253
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm b/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm
new file mode 100644
index 0000000..6d57ba3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_34lsub1)
+include_mpn(`x86/p6/mod_34lsub1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mode1o.asm b/third_party/gmp/mpn/x86/atom/mode1o.asm
new file mode 100644
index 0000000..c9ee6bd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mode1o.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd)
+include_mpn(`x86/pentium/mode1o.asm')
diff --git a/third_party/gmp/mpn/x86/atom/rshift.asm b/third_party/gmp/mpn/x86/atom/rshift.asm
new file mode 100644
index 0000000..1cb5dbe
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/rshift.asm
@@ -0,0 +1,152 @@
+dnl  Intel Atom mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Converted from AMD64 by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			unsigned cnt);
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_rshift)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	%ebx, SAVE_EBX
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shr	%cl, %ebx
+	neg	cnt
+	shl	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	4(up), %eax
+	mov	%eax, %ebp
+	shl	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	4(up), %edx
+	shl	%cl, %eax
+	mov	%edx, %ebx
+	shl	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	-4(rp), rp
+	lea	4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+	ALIGN(8)
+L(top):	shr	%cl, %ebp
+	or	%ebp, %edx
+	shr	%cl, %ebx
+	neg	cnt
+	mov	4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, 4(rp)
+	shl	%cl, %eax
+	lea	8(rp), rp
+L(lo1):	mov	8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shl	%cl, %edx
+	lea	8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shr	%cl, %ebp
+	shr	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	mov	%edx, 4(rp)
+	mov	%ebx, 8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm b/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
new file mode 100644
index 0000000..969a14a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
@@ -0,0 +1,174 @@
+dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 8
+C AMD K6
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n',  `%ecx')
+
+ifdef(`OPERATION_addmul_1',`
+	define(ADDSUB,  add)
+	define(func_1,  mpn_addmul_1)
+	define(func_1c, mpn_addmul_1c)')
+ifdef(`OPERATION_submul_1',`
+	define(ADDSUB,  sub)
+	define(func_1,  mpn_submul_1)
+	define(func_1c, mpn_submul_1c)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_1)
+	xor	%edx, %edx
+L(ent):	push	%edi
+	push	%esi
+	push	%ebx
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), n
+	movd	28(%esp), %mm7
+	test	$1, n
+	jz	L(fi0or2)
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	shr	$2, n
+	jnc	L(fi1)
+
+L(fi3):	lea	-8(up), up
+	lea	-8(rp), rp
+	movd	12(up), %mm1
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	add	$1, n			C increment and clear carry
+	jmp	L(lo3)
+
+L(fi1):	movd	%mm0, %ebx
+	jz	L(wd1)
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	jmp	L(lo1)
+
+L(fi0or2):
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	shr	$2, n
+	movd	4(up), %mm0
+	jc	L(fi2)
+	lea	-4(up), up
+	lea	-4(rp), rp
+	movd	%mm1, %eax
+	pmuludq	%mm7, %mm0
+	jmp	L(lo0)
+
+L(fi2):	lea	4(up), up
+	add	$1, n			C increment and clear carry
+	movd	%mm1, %eax
+	lea	-12(rp), rp
+	jmp	L(lo2)
+
+C	ALIGN(16)			C alignment seems irrelevant
+L(top):	movd	4(up), %mm1
+	adc	$0, %edx
+	ADDSUB	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(lo1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	ADDSUB	%ebx, (rp)
+L(lo0):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	ADDSUB	%eax, 4(rp)
+L(lo3):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	ADDSUB	%ebx, 8(rp)
+L(lo2):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	dec	n
+	jnz	L(top)
+
+L(end):	adc	n, %edx			C n is zero here
+	ADDSUB	%eax, 12(rp)
+	movd	%mm0, %ebx
+	lea	16(rp), rp
+L(wd1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	n, %eax
+	ADDSUB	%ebx, (rp)
+	emms
+	adc	n, %eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+PROLOGUE(func_1c)
+	mov	20(%esp), %edx		C carry
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..782e914
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom  mpn_bdiv_dbm1.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_dbm1c)
+include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm b/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm
new file mode 100644
index 0000000..f84709a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1)
+include_mpn(`x86/pentium4/sse2/divrem_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ae6581d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom/SSE2 mpn_mod_1_1.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm
new file mode 100644
index 0000000..31faa3f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom/SSE2 mpn_mod_1_4.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm b/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm
new file mode 100644
index 0000000..aa3bb97
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm
@@ -0,0 +1,124 @@
+dnl  Intel Atom mpn_mul_1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 7.5
+C AMD K6			 -
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL,  16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+define(`rp', `%edx')
+define(`up', `%esi')
+define(`n',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_mul_1c)
+	movd	PARAM_CARRY, %mm6	C carry
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(8)			C for compact code
+PROLOGUE(mpn_mul_1)
+	pxor	%mm6, %mm6
+L(ent):	push	%esi			FRAME_pushl()
+	mov	PARAM_SRC, up
+	mov	PARAM_SIZE, %eax	C size
+	movd	PARAM_MUL, %mm7
+	movd	(up), %mm0
+	mov	%eax, n
+	and	$3, %eax
+	pmuludq	%mm7, %mm0
+	mov	PARAM_DST, rp
+	jz	L(lo0)
+	cmp	$2, %eax
+	lea	-16(up,%eax,4),up
+	lea	-16(rp,%eax,4),rp
+	jc	L(lo1)
+	jz	L(lo2)
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(lo0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+L(lo3):	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+L(lo2):	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+L(lo1):	paddq	%mm0, %mm6
+	sub	$4, n
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	ja	L(top)
+
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	pop	%esi			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm
new file mode 100644
index 0000000..97d3aeb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm
@@ -0,0 +1,501 @@
+dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl  a third limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C    4 large loops into one; we could use it for the outer loop branch.
+C  * Optimise code outside of inner loops.
+C  * Write combined addmul_1 feed-in and wind-down code, and use when iterating
+C    each outer loop.  ("Overlapping software pipelining")
+C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
+C    for inlined mul_1, allowing us to postpone all pushes.
+C  * Perhaps write special code for vn <= un < M, for some small M.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xn,
+C                        mp_srcptr yp, mp_size_t yn);
+C
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`un',  `%ecx')
+define(`vp',  `%ebp')
+define(`vn',  `36(%esp)')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+	mov	20(%esp), rp
+	mov	24(%esp), up
+	mov	28(%esp), un
+	mov	32(%esp), vp
+
+	movd	(up), %mm0
+	movd	(vp), %mm7
+	pmuludq	%mm7, %mm0
+	pxor	%mm6, %mm6
+
+	mov	un, %eax
+	and	$3, %eax
+	jz	L(of0)
+	cmp	$2, %eax
+	jc	L(of1)
+	jz	L(of2)
+
+C ================================================================
+	jmp	L(m3)
+	ALIGN(16)
+L(lm3):	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(m3):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 8(rp)
+	lea	16(up), up
+	ja	L(lm3)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 12(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-8(rp), rp
+
+L(ol3):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	16(rp,un,4), rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	4(up), %mm1
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	-8(up), up
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a3)
+
+L(la3):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+L(a3):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la3)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol3)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm0):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(of0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	ja	L(lm0)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 16(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-4(rp), rp
+
+L(ol0):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	20(rp,un,4), rp
+
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	4(up), %mm0
+	lea	-4(up), up
+	movd	%mm1, %eax
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a0)
+
+L(la0):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+L(a0):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la0)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol0)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm1):	movd	-12(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -12(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of1):	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, (rp)
+	lea	16(up), up
+	ja	L(lm1)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 4(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-16(rp), rp
+
+L(ol1):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	24(rp,un,4), rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	%mm0, %ebx
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	xor	%edx, %edx	C zero edx and CF
+	inc	un
+	jmp	L(a1)
+
+L(la1):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(a1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la1)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol1)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm2):	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of2):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 4(rp)
+	lea	16(up), up
+	ja	L(lm2)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 8(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-12(rp), rp
+
+L(ol2):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	12(rp,un,4), rp
+
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	4(up), %mm0
+	lea	4(up), up
+	movd	%mm1, %eax
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(lo2)
+
+L(la2):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+L(lo2):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la2)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol2)
+C	jmp	L(done)
+
+C ================================================================
+L(done):
+	emms
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/popcount.asm b/third_party/gmp/mpn/x86/atom/sse2/popcount.asm
new file mode 100644
index 0000000..7847aec
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl  Intel Atom mpn_popcount -- population count.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..af19ed8
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
@@ -0,0 +1,634 @@
+dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C    4 large loops into one; we could use it for the outer loop branch.
+C  * Optimise code outside of inner loops.
+C  * Write combined addmul_1 feed-in and wind-down code, and use when iterating
+C    each outer loop.  ("Overlapping software pipelining")
+C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
+C    all pushes.
+C  * Perhaps write special code for n < M, for some small M.
+C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C    with even less pipelined code.
+C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
+C    Consider breaking out earlier, avoiding the high cost of short loops.
+
+C void mpn_sqr_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xn);
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`n',   `%ecx')
+
+define(`un',  `%ebp')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	push	%edi
+	push	%esi
+	mov	12(%esp), rp
+	mov	16(%esp), up
+	mov	20(%esp), n
+
+	lea	4(rp), rp	C write triangular product starting at rp[1]
+	dec	n
+	movd	(up), %mm7
+
+	jz	L(one)
+	lea	4(up), up
+	push	%ebx
+	push	%ebp
+	mov	n, %eax
+
+	movd	(up), %mm0
+	neg	n
+	pmuludq	%mm7, %mm0
+	pxor	%mm6, %mm6
+	mov	n, un
+
+	and	$3, %eax
+	jz	L(of0)
+	cmp	$2, %eax
+	jc	L(of1)
+	jz	L(of2)
+
+C ================================================================
+	jmp	L(m3)
+	ALIGN(16)
+L(lm3):	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(m3):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 8(rp)
+	lea	16(up), up
+	js	L(lm3)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 12(rp)
+
+	inc	n
+C	jz	L(done)
+  lea	-12(up), up
+  lea	4(rp), rp
+	jmp	L(ol2)
+
+C ================================================================
+	ALIGN(16)
+L(lm0):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(of0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	js	L(lm0)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 16(rp)
+
+	inc	n
+C	jz	L(done)
+  lea	-8(up), up
+  lea	8(rp), rp
+	jmp	L(ol3)
+
+C ================================================================
+	ALIGN(16)
+L(lm1):	movd	-12(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -12(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of1):	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, (rp)
+	lea	16(up), up
+	js	L(lm1)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 4(rp)
+
+	inc	n
+	jz	L(done)		C goes away when we add special n=2 code
+  lea	-20(up), up
+  lea	-4(rp), rp
+	jmp	L(ol0)
+
+C ================================================================
+	ALIGN(16)
+L(lm2):	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of2):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 4(rp)
+	lea	16(up), up
+	js	L(lm2)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 8(rp)
+
+	inc	n
+C	jz	L(done)
+  lea	-16(up), up
+C  lea	(rp), rp
+C	jmp	L(ol1)
+
+C ================================================================
+
+L(ol1):	lea	4(up,n,4), up
+	movd	(up), %mm7	C read next U invariant limb
+	lea	8(rp,n,4), rp
+	mov	n, un
+
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	%mm1, %ebx
+	inc	un
+	jz	L(re1)
+
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a1)
+
+L(la1):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+L(a1):	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la1)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol0):	lea	(up,n,4), up
+	movd	4(up), %mm7	C read next U invariant limb
+	lea	4(rp,n,4), rp
+	mov	n, un
+
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	12(up), %mm1
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a0)
+
+L(la0):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+L(a0):	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la0)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol3):	lea	12(up,n,4), up
+	movd	-8(up), %mm7	C read next U invariant limb
+	lea	(rp,n,4), rp	C put rp back
+	mov	n, un
+
+	movd	-4(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	%mm1, %ebx
+	movd	(up), %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a3)
+
+L(la3):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+L(a3):	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la3)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol2):	lea	8(up,n,4), up
+	movd	-4(up), %mm7	C read next U invariant limb
+	lea	12(rp,n,4), rp
+	mov	n, un
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx
+	sar	$2, un
+	movd	4(up), %mm1
+	test	un, un		C clear carry
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	inc	un
+	jnz	L(a2)
+	jmp	L(re2)
+
+L(la2):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(a2):	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la2)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+	jmp	L(ol1)
+
+C ================================================================
+L(re2):	psrlq	$32, %mm0
+	movd	(up), %mm7	C read next U invariant limb
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	lea	4(rp), rp
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	movd	4(up), %mm1
+	adc	un, %eax
+	add	%ebx, (rp)
+	pmuludq	%mm7, %mm1
+	adc	un, %eax
+	mov	%eax, 4(rp)
+	movd	%mm1, %ebx
+
+L(re1):	psrlq	$32, %mm1
+	add	%ebx, 4(rp)
+	movd	%mm1, %eax
+	adc	un, %eax
+	xor	n, n		C make n zeroness assumption below true
+	mov	%eax, 8(rp)
+
+L(done):			C n is zero here
+	mov	24(%esp), up
+	mov	28(%esp), %eax
+
+	movd	(up), %mm0
+	inc	%eax
+	pmuludq	%mm0, %mm0
+	lea	4(up), up
+	mov	20(%esp), rp
+	shr	%eax
+	movd	%mm0, (rp)
+	psrlq	$32, %mm0
+	lea	-12(rp), rp
+	mov	%eax, 28(%esp)
+	jnc	L(odd)
+
+	movd	%mm0, %ebp
+	movd	(up), %mm0
+	lea	8(rp), rp
+	pmuludq	%mm0, %mm0
+	lea	-4(up), up
+	add	8(rp), %ebp
+	movd	%mm0, %edx
+	adc	12(rp), %edx
+	rcr	n
+	jmp	L(ent)
+
+C	ALIGN(16)		C alignment seems irrelevant
+L(top):	movd	(up), %mm1
+	adc	n, n
+	movd	%mm0, %eax
+	pmuludq	%mm1, %mm1
+	movd	4(up), %mm0
+	adc	(rp), %eax
+	movd	%mm1, %ebx
+	pmuludq	%mm0, %mm0
+	psrlq	$32, %mm1
+	adc	4(rp), %ebx
+	movd	%mm1, %ebp
+	movd	%mm0, %edx
+	adc	8(rp), %ebp
+	adc	12(rp), %edx
+	rcr	n		C FIXME: isn't this awfully slow on atom???
+	adc	%eax, (rp)
+	adc	%ebx, 4(rp)
+L(ent):	lea	8(up), up
+	adc	%ebp, 8(rp)
+	psrlq	$32, %mm0
+	adc	%edx, 12(rp)
+L(odd):	decl	28(%esp)
+	lea	16(rp), rp
+	jnz	L(top)
+
+L(end):	adc	n, n
+	movd	%mm0, %eax
+	adc	n, %eax
+	mov	%eax, (rp)
+
+L(rtn):	emms
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+L(one):	pmuludq	%mm7, %mm7
+	movq	%mm7, -4(rp)
+	emms
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sublsh1_n.asm b/third_party/gmp/mpn/x86/atom/sublsh1_n.asm
new file mode 100644
index 0000000..d3e7e5b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sublsh1_n.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
+include_mpn(`x86/k7/sublsh1_n.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sublsh2_n.asm b/third_party/gmp/mpn/x86/atom/sublsh2_n.asm
new file mode 100644
index 0000000..79405cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sublsh2_n.asm
@@ -0,0 +1,57 @@
+dnl  Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(M4_inst,		adcl)
+	define(M4_opp,		subl)
+	define(M4_function,	mpn_addlsh2_n)
+	define(M4_function_c,	mpn_addlsh2_nc)
+	define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
+	define(M4_ip_function,	mpn_addlsh2_n_ip1)
+',`ifdef(`OPERATION_sublsh2_n', `
+	define(M4_inst,		sbbl)
+	define(M4_opp,		addl)
+	define(M4_function,	mpn_sublsh2_n)
+	define(M4_function_c,	mpn_sublsh2_nc)
+	define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
+	define(M4_ip_function,	mpn_sublsh2_n_ip1)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
+
+include_mpn(`x86/atom/aorslshC_n.asm')
diff --git a/third_party/gmp/mpn/x86/bd1/gmp-mparam.h b/third_party/gmp/mpn/x86/bd1/gmp-mparam.h
new file mode 100644
index 0000000..254cfea
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd1/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-27, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 59.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             245
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                89
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     101
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     111
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                 87
+#define SQR_TOOM4_THRESHOLD                216
+#define SQR_TOOM6_THRESHOLD                294
+#define SQR_TOOM8_THRESHOLD                442
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               22
+#define SQRMOD_BNM1_THRESHOLD               26
+
+#define MUL_FFT_MODF_THRESHOLD             636  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    636, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63, 7}, {   1023, 8}, {    543,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335,11}, {    191,10}, \
+    {    399,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    639,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,11}, \
+    {   4479,13}, {   1151,12}, {   2495,11}, {   4991,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,12}, {   4991,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3327,13}, \
+    {   6911,14}, {   3839,13}, {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    565, 5}, {     29, 6}, {     15, 5}, {     32, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    159,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,11}, {    191,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,12}, {    895,11}, \
+    {   1791,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,10}, {   4479,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2239,11}, {   4479,13}, {   1151,12}, \
+    {   2495,11}, {   4991,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,12}, {   4991,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3327,13}, {   6783,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 152
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  31
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  33
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             198
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                126
+
+#define INV_MULMOD_BNM1_THRESHOLD           82
+#define INV_NEWTON_THRESHOLD               212
+#define INV_APPR_THRESHOLD                 202
+
+#define BINV_NEWTON_THRESHOLD              238
+#define REDC_1_TO_REDC_N_THRESHOLD          55
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              110
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1528
+
+#define POWM_SEC_TABLE  1,20,96,386,1221,2698
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               100
+#define SET_STR_PRECOMPUTE_THRESHOLD       762
+
+#define FAC_DSC_THRESHOLD                  118
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    4  /* 1.22% faster than 3 */
+#define HGCD_THRESHOLD                      67
+#define HGCD_APPR_THRESHOLD                150
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   483
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   4  /* 5.07% faster than 1 */
+
+/* Tuneup completed successfully, took 65358 seconds */
diff --git a/third_party/gmp/mpn/x86/bd2/gmp-mparam.h b/third_party/gmp/mpn/x86/bd2/gmp-mparam.h
new file mode 100644
index 0000000..6893da7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 4000-4200 MHz Piledriver Vishera  */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 40.87% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           24
+
+#define DIV_1_VS_MUL_1_PERCENT             254
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               151
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     100
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     110
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                 93
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                466
+
+#define MULMID_TOOM42_THRESHOLD             66
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    595, 5}, {     27, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 7}, {   1215, 9}, \
+    {    319, 8}, {    639, 9}, {    335, 8}, {    671, 9}, \
+    {    351,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    399,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    895,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,13}, {    639,12}, \
+    {   1471,11}, {   2943,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,11}, {  15871,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7935,12}, {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 155
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             555  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    555, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191, 6}, {   3071, 5}, {   6399, 6}, \
+    {   3455, 7}, {   1791, 8}, {    959,10}, {    255, 9}, \
+    {    511,10}, {    271,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    927,13}, \
+    {    255,12}, {    511,11}, {   1055,10}, {   2111,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2111,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1855,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,12}, {   7935,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                  43
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 75
+#define DC_DIVAPPR_Q_THRESHOLD             200
+#define DC_BDIV_QR_THRESHOLD                71
+#define DC_BDIV_Q_THRESHOLD                119
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 214
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          71
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1597
+
+#define POWM_SEC_TABLE  1,22,96,289,1259
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               173
+#define SET_STR_PRECOMPUTE_THRESHOLD       454
+
+#define FAC_DSC_THRESHOLD                   90
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 5.80% faster than 3 */
+#define HGCD_THRESHOLD                      74
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   4  /* 17.07% faster than 1 */
+
+/* Tuneup completed successfully, took 53914 seconds */
diff --git a/third_party/gmp/mpn/x86/bd4/gmp-mparam.h b/third_party/gmp/mpn/x86/bd4/gmp-mparam.h
new file mode 100644
index 0000000..6c20d0f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd4/gmp-mparam.h
@@ -0,0 +1,225 @@
+/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3800-4200 MHz Excavator/Bristol Ridge  */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        27
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 28.45% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              13
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           28
+
+#define DIV_1_VS_MUL_1_PERCENT             314
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      69
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     121
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 42
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                208
+#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 6}, {   2303, 5}, \
+    {   4735, 4}, {   9471, 5}, {   4863, 7}, {   1279, 9}, \
+    {    335, 8}, {    671, 9}, {    351, 8}, {    703,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671, 8}, \
+    {   1343,10}, {    351, 9}, {    703,10}, {    367, 9}, \
+    {    735,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799, 8}, {   1599,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671, 9}, {   1343,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215, 9}, {   2431,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471, 9}, {   2943,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,10}, {   1919,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,10}, {   3455,12}, {    959,11}, \
+    {   1919,10}, {   3839,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,11}, {   2943,10}, {   5887,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,12}, {   2943,11}, {   5887,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,12}, \
+    {   7935,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,13}, {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 192
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    476, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    735,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415, 9}, {    863,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671, 9}, {   1343,11}, {    351,10}, {    735,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1055,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 176
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  54
+#define MULLO_MUL_N_THRESHOLD            10950
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  77
+#define SQRLO_SQR_THRESHOLD               9449
+
+#define DC_DIV_QR_THRESHOLD                 84
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                79
+#define DC_BDIV_Q_THRESHOLD                 80
+
+#define INV_MULMOD_BNM1_THRESHOLD           71
+#define INV_NEWTON_THRESHOLD               254
+#define INV_APPR_THRESHOLD                 266
+
+#define BINV_NEWTON_THRESHOLD              294
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1387
+#define MU_BDIV_Q_THRESHOLD               1528
+
+#define POWM_SEC_TABLE  1,16,96,480,960
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               264
+#define SET_STR_PRECOMPUTE_THRESHOLD       542
+
+#define FAC_DSC_THRESHOLD                   91
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 9.73% faster than 3 */
+#define HGCD_THRESHOLD                      55
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   562
+#define GCDEXT_DC_THRESHOLD                416
+#define JACOBI_BASE_METHOD                   4  /* 16.50% faster than 1 */
+
+/* Tuneup completed successfully, took 49179 seconds */
diff --git a/third_party/gmp/mpn/x86/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/bdiv_dbm1c.asm
new file mode 100644
index 0000000..0288c47
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bdiv_dbm1c.asm
@@ -0,0 +1,129 @@
+dnl  x86 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.1
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	13.67
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom
+C AMD K6
+C AMD K7			 3.5
+C AMD K8
+C AMD K10
+
+
+C TODO
+C  * Optimize for more x86 processors
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	mov	16(%esp), %ecx		C d
+	push	%esi
+	mov	12(%esp), %esi		C ap
+	push	%edi
+	mov	12(%esp), %edi		C qp
+	push	%ebp
+	mov	24(%esp), %ebp		C n
+	push	%ebx
+
+	mov	(%esi), %eax
+	mul	%ecx
+	mov	36(%esp), %ebx
+	sub	%eax, %ebx
+	mov	%ebx, (%edi)
+	sbb	%edx, %ebx
+
+	mov	%ebp, %eax
+	and	$3, %eax
+	jz	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	lea	-8(%esi), %esi
+	lea	8(%edi), %edi
+	add	$-3, %ebp
+	jmp	L(3)
+
+L(b0):	mov	4(%esi), %eax
+	lea	-4(%esi), %esi
+	lea	12(%edi), %edi
+	add	$-4, %ebp
+	jmp	L(0)
+
+L(b2):	mov	4(%esi), %eax
+	lea	4(%esi), %esi
+	lea	4(%edi), %edi
+	add	$-2, %ebp
+	jmp	L(2)
+
+	ALIGN(8)
+L(top):	mov	4(%esi), %eax
+	mul	%ecx
+	lea	16(%edi), %edi
+	sub	%eax, %ebx
+	mov	8(%esi), %eax
+	mov	%ebx, -12(%edi)
+	sbb	%edx, %ebx
+L(0):	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, -8(%edi)
+	sbb	%edx, %ebx
+L(3):	mov	12(%esi), %eax
+	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, -4(%edi)
+	mov	16(%esi), %eax
+	lea	16(%esi), %esi
+	sbb	%edx, %ebx
+L(2):	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, 0(%edi)
+	sbb	%edx, %ebx
+L(b1):	add	$-4, %ebp
+	jns	L(top)
+
+	mov	%ebx, %eax
+	pop	%ebx
+	pop	%ebp
+	pop	%edi
+	pop	%esi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/bdiv_q_1.asm b/third_party/gmp/mpn/x86/bdiv_q_1.asm
new file mode 100644
index 0000000..132de06
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bdiv_q_1.asm
@@ -0,0 +1,208 @@
+dnl  x86 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54    30.0
+C P55    29.0
+C P6     13.0 odd divisor, 12.0 even (strangely)
+C K6     14.0
+C K7     12.0
+C P4     42.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SHIFT, %ecx
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_INVERSE, %eax
+	movl	PARAM_SIZE, %ebp
+	pushl	%ebx	FRAME_pushl()
+L(common):
+	pushl	%edi	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	xorl	%ebx, %ebx
+	xorl	%edx, %edx
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	VAR_INVERSE, %edx
+	jmp	L(entry)
+
+
+	ALIGN(8)
+	nop	C k6 code alignment
+	nop
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or -1
+	C ecx	shift
+	C edx	carry limb
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	movl	-4(%esi,%ebp,4), %eax
+	subl	%ebx, %edx		C accumulate carry bit
+
+	movl	(%esi,%ebp,4), %ebx
+
+	shrdl(	%cl, %ebx, %eax)
+
+	subl	%edx, %eax		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebx, %ebx
+
+L(entry):
+	imull	%edx, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	movl	PARAM_DIVISOR, %edx
+
+	mull	%edx
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	-4(%esi), %eax		C src high limb
+L(one):
+	shrl	%cl, %eax
+	popl	%esi	FRAME_popl()
+
+	addl	%ebx, %eax		C apply carry bit
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
+
+	ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebp	FRAME_pushl()
+
+	movl	$-1, %ecx		C shift count
+	movl	PARAM_SIZE, %ebp
+
+	pushl	%ebx	FRAME_pushl()
+
+L(strip_twos):
+	incl	%ecx
+
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+	imull	%eax, %eax		C inv*inv
+	imull	%ebx, %eax		C inv*inv*d
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+
+	leal	(%edx,%edx), %eax	C 2*inv
+	imull	%edx, %edx		C inv*inv
+	imull	%ebx, %edx		C inv*inv*d
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/bt1/gmp-mparam.h b/third_party/gmp/mpn/x86/bt1/gmp-mparam.h
new file mode 100644
index 0000000..302dbc6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bt1/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+   the value in mpn/x86/k7/gmp-mparam.h.  The latter is used as a hard limit in
+   k7/sqr_basecase.asm.  */
+
+/* 1600 MHz AMD Bobcat Zacate E-350 */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        16
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 57.16% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           36
+
+#define DIV_1_VS_MUL_1_PERCENT             199
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     102
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     177
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     169
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     143
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                248
+#define SQR_TOOM6_THRESHOLD                342
+#define SQR_TOOM8_THRESHOLD                470
+
+#define MULMID_TOOM42_THRESHOLD             72
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               21
+
+#define MUL_FFT_MODF_THRESHOLD             630  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    630, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 6}, \
+    {    767, 7}, {    399, 6}, {    799, 7}, {    415, 8}, \
+    {    235, 7}, {    479, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             500  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    500, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    127, 6}, {   1087, 7}, {    575, 8}, {    303, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415, 9}, {    831,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             9
+#define MULLO_DC_THRESHOLD                  48
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             7
+#define SQRLO_DC_THRESHOLD                 146
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 77
+#define DC_DIVAPPR_Q_THRESHOLD             240
+#define DC_BDIV_QR_THRESHOLD                83
+#define DC_BDIV_Q_THRESHOLD                182
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              252
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1713
+
+#define POWM_SEC_TABLE  1,16,96,563,1317,1867
+
+#define GET_STR_DC_THRESHOLD                19
+#define GET_STR_PRECOMPUTE_THRESHOLD        32
+#define SET_STR_DC_THRESHOLD               254
+#define SET_STR_PRECOMPUTE_THRESHOLD       907
+
+#define FAC_DSC_THRESHOLD                  224
+#define FAC_ODD_THRESHOLD                   55
+
+#define MATRIX22_STRASSEN_THRESHOLD         23
+#define HGCD2_DIV1_METHOD                    3  /* 3.59% faster than 5 */
+#define HGCD_THRESHOLD                      85
+#define HGCD_APPR_THRESHOLD                152
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   531
+#define GCDEXT_DC_THRESHOLD                386
+#define JACOBI_BASE_METHOD                   3  /* 0.92% faster than 1 */
+
+/* Tuneup completed successfully, took 159946 seconds */
diff --git a/third_party/gmp/mpn/x86/bt2/gmp-mparam.h b/third_party/gmp/mpn/x86/bt2/gmp-mparam.h
new file mode 100644
index 0000000..f936cb7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bt2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+   the value in mpn/x86/k7/gmp-mparam.h.  The latter is used as a hard limit in
+   k7/sqr_basecase.asm.  */
+
+/* 2050 MHz AMD Jaguar/Kabini */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               6
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 47.53% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             243
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                90
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                126
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     28, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    991,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 153
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    530, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    991,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2495,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 151
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             8
+#define MULLO_DC_THRESHOLD                  44
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD            13
+#define SQRLO_DC_THRESHOLD                  62
+#define SQRLO_SQR_THRESHOLD               8907
+
+#define DC_DIV_QR_THRESHOLD                 79
+#define DC_DIVAPPR_Q_THRESHOLD             228
+#define DC_BDIV_QR_THRESHOLD                75
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           90
+#define INV_NEWTON_THRESHOLD               260
+#define INV_APPR_THRESHOLD                 236
+
+#define BINV_NEWTON_THRESHOLD              294
+#define REDC_1_TO_REDC_N_THRESHOLD          80
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              118
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1652
+
+#define POWM_SEC_TABLE  1,16,96,615,865,1442
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               252
+#define SET_STR_PRECOMPUTE_THRESHOLD       638
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   39
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 13.65% faster than 3 */
+#define HGCD_THRESHOLD                      81
+#define HGCD_APPR_THRESHOLD                 66
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   531
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   1  /* 0.84% faster than 4 */
+
+/* Tuneup completed successfully, took 103818 seconds */
diff --git a/third_party/gmp/mpn/x86/cnd_aors_n.asm b/third_party/gmp/mpn/x86/cnd_aors_n.asm
new file mode 100644
index 0000000..74f4917
--- /dev/null
+++ b/third_party/gmp/mpn/x86/cnd_aors_n.asm
@@ -0,0 +1,124 @@
+dnl  X86 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 5.4
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	14.5
+C P4 model 3-4 (Prescott)	21
+C Intel atom			11
+C AMD K6			 ?
+C AMD K7			 3.4
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
+define(`cy',  `%edx')
+
+ifdef(`OPERATION_cnd_add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	C make cnd into a full mask
+	mov	cnd, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	mov	%eax, cnd
+
+	C load parameters into registers
+	mov	24(%esp), rp
+	mov	28(%esp), up
+	mov	32(%esp), vp
+	mov	36(%esp), n
+
+	mov	(vp), %eax
+	mov	(up), %ebx
+
+	C put operand pointers just beyond their last limb
+	lea	(vp,n,4), vp
+	lea	(up,n,4), up
+	lea	-4(rp,n,4), rp
+	neg	n
+
+	and	cnd, %eax
+	ADDSUB	%eax, %ebx
+	sbb	cy, cy
+	inc	n
+	je	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp,n,4), %eax
+	and	cnd, %eax
+	mov	%ebx, (rp,n,4)
+	mov	(up,n,4), %ebx
+	add	cy, cy
+	ADCSBB	%eax, %ebx
+	sbb	cy, cy
+	inc	n
+	jne	L(top)
+
+L(end):	mov	%ebx, (rp)
+	xor	%eax, %eax
+	sub	cy, %eax
+
+	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/copyd.asm b/third_party/gmp/mpn/x86/copyd.asm
new file mode 100644
index 0000000..51fa195
--- /dev/null
+++ b/third_party/gmp/mpn/x86/copyd.asm
@@ -0,0 +1,91 @@
+dnl  x86 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb  startup (approx)
+C P5	  1.0	      40
+C P6	  2.4	      70
+C K6	  1.0	      55
+C K7	  1.3	      75
+C P4	  2.6	     175
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+	C eax	saved esi
+	C ebx
+	C ecx	counter
+	C edx	saved edi
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+	leal	-4(%esi,%ecx,4), %esi
+
+	leal	-4(%edi,%ecx,4), %edi
+
+	std
+
+	rep
+	movsl
+
+	cld
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/copyi.asm b/third_party/gmp/mpn/x86/copyi.asm
new file mode 100644
index 0000000..f6b0354
--- /dev/null
+++ b/third_party/gmp/mpn/x86/copyi.asm
@@ -0,0 +1,99 @@
+dnl  x86 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb  startup (approx)
+C P5	  1.0	      35
+C P6	  0.75	      45
+C K6	  1.0	      30
+C K7	  1.3	      65
+C P4	  1.0	     120
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P6 -  An MMX based copy was tried, but was found to be slower than a rep
+C       movs in all cases.  The fastest MMX found was 0.8 cycles/limb (when
+C       fully aligned).  A rep movs seems to have a startup time of about 15
+C       cycles, but doing something special for small sizes could lead to a
+C       branch misprediction that would destroy any saving.  For now a plain
+C       rep movs seems ok.
+C
+C K62 - We used to have a big chunk of code doing an MMX copy at 0.56 c/l if
+C       aligned or a 1.0 rep movs if not.  But that seemed excessive since
+C       it only got an advantage half the time, and even then only showed it
+C       above 50 limbs or so.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+	C eax	saved esi
+	C ebx
+	C ecx	counter
+	C edx	saved edi
+	C esi	src
+	C edi	dst
+	C ebp
+
+PROLOGUE(mpn_copyi)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/core2/gmp-mparam.h b/third_party/gmp/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 0000000..8a44ad1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,210 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD             MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 22.20% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           18
+
+#define DIV_1_VS_MUL_1_PERCENT             277
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               300
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      91
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                184
+#define SQR_TOOM6_THRESHOLD                262
+#define SQR_TOOM8_THRESHOLD                597
+
+#define MULMID_TOOM42_THRESHOLD             70
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             505  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    505, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63, 9}, {    255,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    399, 9}, \
+    {    799,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,12}, \
+    {   7935,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 147
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             464  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    464, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 5}, {   4863, 6}, {   2495, 7}, \
+    {   1343, 8}, {    703, 9}, {    367,12}, {     63,11}, \
+    {    127,10}, {    303,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,10}, {    367,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  36
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 140
+#define SQRLO_SQR_THRESHOLD              10393
+
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD             116
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                180
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               138
+#define INV_APPR_THRESHOLD                 123
+
+#define BINV_NEWTON_THRESHOLD              306
+#define REDC_1_TO_REDC_N_THRESHOLD          82
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD               63
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define POWM_SEC_TABLE  1,22,66,428,1035
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               732
+#define SET_STR_PRECOMPUTE_THRESHOLD      1118
+
+#define FAC_DSC_THRESHOLD                  115
+#define FAC_ODD_THRESHOLD                   50
+
+#define MATRIX22_STRASSEN_THRESHOLD         25
+#define HGCD2_DIV1_METHOD                    1  /* 5.78% faster than 3 */
+#define HGCD_THRESHOLD                     121
+#define HGCD_APPR_THRESHOLD                151
+#define HGCD_REDUCE_THRESHOLD             3259
+#define GCD_DC_THRESHOLD                   368
+#define GCDEXT_DC_THRESHOLD                306
+#define JACOBI_BASE_METHOD                   4  /* 14.19% faster than 1 */
+
+/* Tuneup completed successfully, took 67142 seconds */
diff --git a/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h b/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..7b58cad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h
@@ -0,0 +1,216 @@
+/* x86/coreibwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                15
+#define MOD_1_UNNORM_THRESHOLD              16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 21.34% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             14
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              29
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           19
+
+#define DIV_1_VS_MUL_1_PERCENT             295
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                97
+#define MUL_TOOM44_THRESHOLD               220
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                134
+#define SQR_TOOM4_THRESHOLD                242
+#define SQR_TOOM6_THRESHOLD                342
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             540  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    540, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     17, 5}, {     36, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     43, 9}, {     23, 8}, {     55,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95, 7}, {   1599, 8}, {    831, 9}, {    431, 8}, \
+    {    863, 9}, {    447,10}, {    239, 9}, {    479,10}, \
+    {    255, 9}, {    511,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1023,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1119,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1119,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3839,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2559,13}, {   5247,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             472  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     37, 7}, {     19, 6}, {     40, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    271, 9}, {    543, 6}, {   4479, 7}, {   2431, 8}, \
+    {   1247, 7}, {   2495, 8}, {   1279,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1663,12}, \
+    {    895,11}, {   1855,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2047,12}, {   1087,11}, {   2239,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1663,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2495,13}, \
+    {   1279,12}, {   2623,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3839,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3327,13}, {   6783,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  37
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             146
+#define DC_BDIV_QR_THRESHOLD                98
+#define DC_BDIV_Q_THRESHOLD                218
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               173
+#define INV_APPR_THRESHOLD                 165
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1787
+#define MUPI_DIV_QR_THRESHOLD               78
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  1,16,126,416,932
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
+#define SET_STR_DC_THRESHOLD               306
+#define SET_STR_PRECOMPUTE_THRESHOLD       894
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    3  /* 5.97% faster than 1 */
+#define HGCD_THRESHOLD                      73
+#define HGCD_APPR_THRESHOLD                123
+#define HGCD_REDUCE_THRESHOLD             3664
+#define GCD_DC_THRESHOLD                   562
+#define GCDEXT_DC_THRESHOLD                465
+#define JACOBI_BASE_METHOD                   1  /* 31.16% faster than 3 */
+
+/* Tuneup completed successfully, took 35114 seconds */
diff --git a/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h b/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h
new file mode 100644
index 0000000..ea4ac11
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/coreihwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                17
+#define MOD_1_UNNORM_THRESHOLD              17
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      5
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 11.44% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             13
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define DIV_1_VS_MUL_1_PERCENT             296
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD               108
+#define MUL_TOOM44_THRESHOLD               232
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     113
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                141
+#define SQR_TOOM4_THRESHOLD                384
+#define SQR_TOOM6_THRESHOLD                517
+#define SQR_TOOM8_THRESHOLD                698
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     43, 9}, {     23, 8}, {     55, 9}, \
+    {     31, 8}, {     71, 9}, {     39, 8}, {     83, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,10}, {    111,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191, 6}, {   3199, 7}, {   1727, 9}, {    447,10}, \
+    {    239, 9}, {    479,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1023,10}, {    527,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,14}, {   2559,13}, \
+    {   5375,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 165
+#define MUL_FFT_THRESHOLD                 7808
+
+#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    560, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     17, 5}, {     36, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     40, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,12}, {     63,11}, {    127, 9}, {    511, 5}, \
+    {   8959, 7}, {   2431, 8}, {   1247, 7}, {   2495, 8}, \
+    {   1279, 9}, {    671,10}, {    367,11}, {    191,10}, \
+    {    399, 9}, {    799,10}, {    415,12}, {    127,11}, \
+    {    255,10}, {    527,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,11}, {    543,10}, {   1119,11}, \
+    {    607,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    863,12}, {    447,11}, \
+    {    991,12}, {    511,11}, {   1119,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1983,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2559,13}, \
+    {   5119,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3327,13}, {   6911,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD            17
+#define MULLO_DC_THRESHOLD                  40
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 141
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 30
+#define DC_DIVAPPR_Q_THRESHOLD             190
+#define DC_BDIV_QR_THRESHOLD                67
+#define DC_BDIV_Q_THRESHOLD                254
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               157
+#define INV_APPR_THRESHOLD                 163
+
+#define BINV_NEWTON_THRESHOLD              236
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1895
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD               54
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1898
+
+#define POWM_SEC_TABLE  1,16,95,480,1442
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               372
+#define SET_STR_PRECOMPUTE_THRESHOLD      1037
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    3  /* 6.26% faster than 1 */
+#define HGCD_THRESHOLD                      70
+#define HGCD_APPR_THRESHOLD                129
+#define HGCD_REDUCE_THRESHOLD             3664
+#define GCD_DC_THRESHOLD                   573
+#define GCDEXT_DC_THRESHOLD                483
+#define JACOBI_BASE_METHOD                   1  /* 27.01% faster than 3 */
+
+/* Tuneup completed successfully, took 35232 seconds */
diff --git a/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h b/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..4428b4b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,223 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                36
+#define MOD_1_UNNORM_THRESHOLD              40
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 42.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           17
+
+#define DIV_1_VS_MUL_1_PERCENT             288
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               214
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     134
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     118
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                133
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                620
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    595, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     17, 5}, {     35, 6}, {     28, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     99, 9}, {     55,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    335,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,12}, \
+    {    191,11}, {    383,10}, {    767,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1119,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,10}, {   1727,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1119,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,10}, {   3455,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,10}, \
+    {   4479,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,11}, {   2943,13}, {    767,12}, {   1727,11}, \
+    {   3455,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,11}, {   4479,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   6015,15}, {   1535,14}, \
+    {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 170
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     39, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159, 6}, {   2687, 7}, \
+    {   1407, 9}, {    367, 8}, {    735, 9}, {    383,10}, \
+    {    207, 9}, {    415,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    991,10}, \
+    {   1983,13}, {    255,12}, {    511,11}, {   1119,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1791,12}, {    959,11}, \
+    {   1983,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1983,11}, {   3967,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2495,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3967,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3327,13}, \
+    {   6655,14}, {   3839,13}, {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 187
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  43
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                  42
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             132
+#define DC_BDIV_QR_THRESHOLD                83
+#define DC_BDIV_Q_THRESHOLD                130
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               189
+#define INV_APPR_THRESHOLD                 167
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_N_THRESHOLD          83
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define POWM_SEC_TABLE  1,28,96,473,803
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               145
+#define SET_STR_PRECOMPUTE_THRESHOLD       419
+
+#define FAC_DSC_THRESHOLD                  114
+#define FAC_ODD_THRESHOLD                   57
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    1  /* 1.03% faster than 3 */
+#define HGCD_THRESHOLD                     117
+#define HGCD_APPR_THRESHOLD                137
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   389
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   4  /* 6.10% faster than 1 */
+
+/* Tuneup completed successfully, took 67994 seconds */
diff --git a/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h b/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..23d708a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                28
+#define MOD_1_UNNORM_THRESHOLD              26
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      4
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 88.29% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD             21
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              14
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             297
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD               105
+#define MUL_TOOM44_THRESHOLD               190
+#define MUL_TOOM6H_THRESHOLD               294
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     144
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     116
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     160
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 48
+#define SQR_TOOM3_THRESHOLD                163
+#define SQR_TOOM4_THRESHOLD                250
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             666  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    666, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    159, 7}, {   1343, 8}, \
+    {    703, 9}, {    367, 8}, {    735, 9}, {    383,10}, \
+    {    207,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,12}, {    191,11}, \
+    {    383,10}, {    799,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 163
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    570, 5}, {     28, 6}, {     15, 5}, {     32, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     40, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63, 8}, {   1023, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    991,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2495,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3967,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2559,13}, {   5119,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 163
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  46
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 159
+#define SQRLO_SQR_THRESHOLD              11317
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             191
+#define DC_BDIV_QR_THRESHOLD               107
+#define DC_BDIV_Q_THRESHOLD                232
+
+#define INV_MULMOD_BNM1_THRESHOLD           62
+#define INV_NEWTON_THRESHOLD               181
+#define INV_APPR_THRESHOLD                 182
+
+#define BINV_NEWTON_THRESHOLD              378
+#define REDC_1_TO_REDC_N_THRESHOLD          91
+
+#define MU_DIV_QR_THRESHOLD               1858
+#define MU_DIVAPPR_Q_THRESHOLD            1858
+#define MUPI_DIV_QR_THRESHOLD               77
+#define MU_BDIV_QR_THRESHOLD              1830
+#define MU_BDIV_Q_THRESHOLD               2166
+
+#define POWM_SEC_TABLE  1,16,126,428,1442
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               418
+#define SET_STR_PRECOMPUTE_THRESHOLD      1104
+
+#define FAC_DSC_THRESHOLD                  149
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    1  /* 5.54% faster than 4 */
+#define HGCD_THRESHOLD                      66
+#define HGCD_APPR_THRESHOLD                135
+#define HGCD_REDUCE_THRESHOLD             4284
+#define GCD_DC_THRESHOLD                   642
+#define GCDEXT_DC_THRESHOLD                465
+#define JACOBI_BASE_METHOD                   3  /* 14.76% faster than 4 */
+
+/* Tuneup completed successfully, took 44241 seconds */
diff --git a/third_party/gmp/mpn/x86/darwin.m4 b/third_party/gmp/mpn/x86/darwin.m4
new file mode 100644
index 0000000..c449216
--- /dev/null
+++ b/third_party/gmp/mpn/x86/darwin.m4
@@ -0,0 +1,102 @@
+divert(-1)
+dnl  Copyright 2007, 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+
+dnl  Usage LEA(symbol,reg)
+dnl  Usage LEAL(symbol_local_to_file,reg)
+dnl
+dnl  We maintain lists of stuff to append in load_eip and darwin_bd.  The
+dnl  `index' stuff is needed to suppress repeated definitions.  To avoid
+dnl  getting fooled by "var" and "var1", we add 'bol ' (the end of
+dnl  'indirect_symbol') at the beginning and a newline at the end.  This
+dnl  might be a bit fragile.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')
+ifelse(index(defn(`darwin_bd'), `bol $1
+'),-1,
+`m4append(`darwin_bd',
+`	.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L($1`'$non_lazy_ptr):
+	.indirect_symbol $1
+	.long	 0
+')')
+	call	L(movl_eip_`'substr($2,1))
+	movl	L($1`'$non_lazy_ptr)-.($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')
+	call	L(movl_eip_`'substr($2,1))
+	leal	$1-.($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip`'darwin_bd')
+
+define(`load_eip', `')		dnl updated in LEA
+define(`darwin_bd', `')		dnl updated in LEA
+
+
+dnl  Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`call	GSYM_PREFIX`'$1')
+
+undefine(`PIC_WITH_EBX')
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86/dive_1.asm b/third_party/gmp/mpn/x86/dive_1.asm
new file mode 100644
index 0000000..5bb0f45
--- /dev/null
+++ b/third_party/gmp/mpn/x86/dive_1.asm
@@ -0,0 +1,190 @@
+dnl  x86 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54    30.0
+C P55    29.0
+C P6     13.0 odd divisor, 12.0 even (strangely)
+C K6     14.0
+C K7     12.0
+C P4     42.0
+
+
+C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebp
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%ebx	FRAME_pushl()
+	movl	$-1, %ecx		C shift count
+
+	pushl	%esi	FRAME_pushl()
+
+L(strip_twos):
+	incl	%ecx
+
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	xorl	%ebx, %ebx
+	xorl	%edx, %edx
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	VAR_INVERSE, %edx
+	jmp	L(entry)
+
+
+	ALIGN(8)
+	nop	C k6 code alignment
+	nop
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or -1
+	C ecx	shift
+	C edx	carry limb
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	movl	-4(%esi,%ebp,4), %eax
+	subl	%ebx, %edx		C accumulate carry bit
+
+	movl	(%esi,%ebp,4), %ebx
+
+	shrdl(	%cl, %ebx, %eax)
+
+	subl	%edx, %eax		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebx, %ebx
+
+L(entry):
+	imull	%edx, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	movl	PARAM_DIVISOR, %edx
+
+	mull	%edx
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	-4(%esi), %eax		C src high limb
+L(one):
+	shrl	%cl, %eax
+	popl	%esi	FRAME_popl()
+
+	addl	%ebx, %eax		C apply carry bit
+	popl	%ebx	FRAME_popl()
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+EPILOGUE()
+ASM_END()
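The heart of dive_1.asm above is the pair of `inv = 2*inv - inv*inv*d` sequences: a Newton iteration that doubles the number of correct low bits of the divisor's inverse mod 2^32 at each step, after which exact division becomes a single multiply per limb. A minimal C sketch of the same idea, assuming 32-bit limbs; `binvert32` is a hypothetical helper name, and it seeds with Montgomery's `(3*d)^2` trick instead of the 8-bit `binvert_limb_table` lookup the asm uses:

```c
#include <stdint.h>

/* Inverse of an odd d modulo 2^32 by Newton iteration, mirroring the
   "inv = 2*inv - inv*inv*d" steps in dive_1.asm.  The seed (3*d)^2 is
   correct to 5 low bits; each step doubles that, so three steps give
   40 >= 32 correct bits.  Requires d odd (the asm strips twos first). */
static uint32_t
binvert32 (uint32_t d)
{
  uint32_t inv = (3 * d) ^ 2;      /* 5 bits correct */
  inv = 2 * inv - inv * inv * d;   /* 10 bits */
  inv = 2 * inv - inv * inv * d;   /* 20 bits */
  inv = 2 * inv - inv * inv * d;   /* 40 >= 32 bits */
  return inv;
}
```

With `inv` in hand, an exact quotient limb is just `q = n * inv` (mod 2^32), which is why the asm's inner loop has an `imull` where a plain division routine would need a `divl`.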
diff --git a/third_party/gmp/mpn/x86/divrem_1.asm b/third_party/gmp/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000..255d493
--- /dev/null
+++ b/third_party/gmp/mpn/x86/divrem_1.asm
@@ -0,0 +1,233 @@
+dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       cycles/limb
+C 486   approx 43 maybe
+C P5        44
+C P6        39
+C P6MMX     39
+C K6        22
+C K7        42
+C P4        58
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                          mp_limb_t carry);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize.  Return the
+C remainder.  Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size.  Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
+C instruction even when gcc is not being used (when longlong.h only has the
+C rather slow generic C udiv_qrnnd()).
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done.  A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount.  The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C Notes for P5:
+C
+C It might be thought that moving the load down to pair with the store would
+C save 1 cycle, but that doesn't seem to happen in practice, and in any case
+C would be a mere 2.2% saving, so it's hardly worth bothering about.
+C
+C A mul-by-inverse might be a possibility for P5, as done in
+C mpn/x86/pentium/mod_1.asm.  The number of auxiliary instructions required
+C is a hindrance, but there could be a 10-15% speedup available.
+C
+C
+C Notes for K6:
+C
+C K6 has its own version of this code, using loop and paying attention to
+C cache line boundary crossings.  The target 20 c/l can be had with the
+C decl+jnz of the present code by pairing up the load and store in the
+C loops.  But it's considered easier not to introduce complexity just for
+C that, but instead let k6 have its own code.
+C
+
+defframe(PARAM_CARRY,  24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	orl	%ecx, %ecx
+
+	movl	PARAM_CARRY, %edx
+	jz	L(fraction)
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jmp	L(integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	orl	%ecx,%ecx
+
+	jz	L(size_zero)
+	pushl	%ebx		FRAME_pushl()
+
+	movl	-4(%edi,%ecx,4), %eax	C src high limb
+	xorl	%edx, %edx
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	cmpl	%esi, %eax
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jae	L(integer_entry)
+
+
+	C high<divisor, so high of dst is zero, and avoid one div
+
+	movl	%edx, (%ebx,%ecx,4)
+	decl	%ecx
+
+	movl	%eax, %edx
+	jz	L(fraction)
+
+
+L(integer_top):
+	C eax	scratch (quotient)
+	C ebx	dst+4*xsize-4
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi	src
+	C ebp	xsize
+
+	movl	-4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+	divl	%esi
+
+	movl	%eax, (%ebx,%ecx,4)
+	decl	%ecx
+	jnz	L(integer_top)
+
+
+L(fraction):
+	orl	%ebp, %ecx
+	jz	L(done)
+
+	movl	PARAM_DST, %ebx
+
+
+L(fraction_top):
+	C eax	scratch (quotient)
+	C ebx	dst
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi
+	C ebp
+
+	xorl	%eax, %eax
+
+	divl	%esi
+
+	movl	%eax, -4(%ebx,%ecx,4)
+	decl	%ecx
+	jnz	L(fraction_top)
+
+
+L(done):
+	popl	%ebp
+	movl	%edx, %eax
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+	movl	PARAM_XSIZE, %ecx
+	xorl	%eax, %eax
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	stosl
+
+	popl	%esi
+	popl	%edi
+	ret
+EPILOGUE()
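The comments above describe the mpn_divrem_1 contract: quotient limbs of the integer part go at dst+xsize, xsize extra fractional quotient limbs go at dst, and the remainder is returned. A portable C sketch of that contract (not GMP's implementation: it assumes 32-bit limbs and uses 64-bit arithmetic in place of the `divl` instruction; `divrem_1` here is a hypothetical local name):

```c
#include <stddef.h>
#include <stdint.h>

/* Divide src[0..size-1] (little-endian 32-bit limbs) by divisor,
   storing the integer quotient at dst+xsize and xsize fractional
   quotient limbs at dst, and returning the remainder.  Either or
   both of xsize and size may be 0, as the asm above allows. */
static uint32_t
divrem_1 (uint32_t *dst, size_t xsize,
          const uint32_t *src, size_t size, uint32_t divisor)
{
  uint64_t r = 0;
  size_t i;
  for (i = size; i-- > 0; )          /* integer part, high limb first */
    {
      uint64_t n = (r << 32) | src[i];
      dst[xsize + i] = (uint32_t) (n / divisor);
      r = n % divisor;
    }
  for (i = xsize; i-- > 0; )         /* fractional limbs extend below */
    {
      uint64_t n = r << 32;
      dst[i] = (uint32_t) (n / divisor);
      r = n % divisor;
    }
  return (uint32_t) r;
}
```

The "high&lt;divisor" shortcut in the asm corresponds to noticing that the first iteration's quotient limb is zero whenever the top source limb is below the divisor, so one hardware division can be skipped.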
diff --git a/third_party/gmp/mpn/x86/divrem_2.asm b/third_party/gmp/mpn/x86/divrem_2.asm
new file mode 100644
index 0000000..4c38ad0
--- /dev/null
+++ b/third_party/gmp/mpn/x86/divrem_2.asm
@@ -0,0 +1,199 @@
+dnl  x86 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		norm	frac
+C 486
+C P5
+C P6-13		29.2
+C P6-15		*26
+C K6
+C K7		22
+C K8		*19
+C P4-f1
+C P4-f2		*65
+C P4-f3
+C P4-f4		*72
+
+C A star means numbers not updated for the latest version of the code.
+
+
+C TODO
+C  * Perhaps keep ecx or esi in stack slot, freeing up a reg for q0.
+C  * The loop has not been carefully tuned.  We should at the very least do
+C    some local insn swapping.
+C  * The code outside the main loop is what gcc generated.  Clean up!
+C  * Clean up stack slot usage.
+
+C INPUT PARAMETERS
+C qp
+C fn
+C up_param
+C un_param
+C dp
+
+
+C eax ebx ecx edx esi edi ebp
+C         cnt         qp
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$36, %esp
+	mov	68(%esp), %ecx		C un
+	mov	72(%esp), %esi		C dp
+	movl	$0, 32(%esp)
+	lea	0(,%ecx,4), %edi
+	add	64(%esp), %edi		C up
+	mov	(%esi), %ebx
+	mov	4(%esi), %eax
+	mov	%ebx, 20(%esp)
+	sub	$12, %edi
+	mov	%eax, 24(%esp)
+	mov	%edi, 12(%esp)
+	mov	8(%edi), %ebx
+	mov	4(%edi), %ebp
+	cmp	%eax, %ebx
+	jb	L(8)
+	seta	%dl
+	cmp	20(%esp), %ebp
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	jne	L(35)
+L(8):
+	mov	60(%esp), %esi		C fn
+	lea	-3(%esi,%ecx), %edi
+	test	%edi, %edi
+	js	L(9)
+	mov	24(%esp), %edx
+	mov	$-1, %esi
+	mov	%esi, %eax
+	mov	%esi, %ecx
+	not	%edx
+	divl	24(%esp)
+	mov	%eax, %esi
+	imul	24(%esp), %eax
+	mov	%eax, (%esp)
+	mov	%esi, %eax
+	mull	20(%esp)
+	mov	(%esp), %eax
+	add	20(%esp), %eax
+	adc	$0, %ecx
+	add	%eax, %edx
+	adc	$0, %ecx
+	mov	%ecx, %eax
+	js	L(32)
+L(36):	dec	%esi
+	sub	24(%esp), %edx
+	sbb	$0, %eax
+	jns	L(36)
+L(32):
+	mov	%esi, 16(%esp)		C di
+	mov	%edi, %ecx		C un
+	mov	12(%esp), %esi		C up
+	mov	24(%esp), %eax
+	neg	%eax
+	mov	%eax, 4(%esp)		C -d1
+	ALIGN(16)
+	nop
+
+C eax ebx ecx edx esi edi ebp  0    4   8   12  16  20  24  28  32   56  60
+C     n2  un      up      n1   q0  -d1          di  d0  d1      msl  qp  fn
+
+L(loop):
+	mov	16(%esp), %eax		C di
+	mul	%ebx
+	add	%ebp, %eax
+	mov	%eax, (%esp)		C q0
+	adc	%ebx, %edx
+	mov	%edx, %edi		C q
+	imul	4(%esp), %edx
+	mov	20(%esp), %eax
+	lea	(%edx, %ebp), %ebx	C n1 -= ...
+	mul	%edi
+	xor	%ebp, %ebp
+	cmp	60(%esp), %ecx
+	jl	L(19)
+	mov	(%esi), %ebp
+	sub	$4, %esi
+L(19):	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	sub	%eax, %ebp
+	sbb	%edx, %ebx
+	mov	20(%esp), %eax		C d1
+	inc	%edi
+	xor	%edx, %edx
+	cmp	(%esp), %ebx
+	adc	$-1, %edx		C mask
+	add	%edx, %edi		C q--
+	and	%edx, %eax		C d0 or 0
+	and	24(%esp), %edx		C d1 or 0
+	add	%eax, %ebp
+	adc	%edx, %ebx
+	cmp	24(%esp), %ebx
+	jae	L(fix)
+L(bck):	mov	56(%esp), %edx
+	mov	%edi, (%edx, %ecx, 4)
+	dec	%ecx
+	jns	L(loop)
+
+L(9):	mov	64(%esp), %esi		C up
+	mov	%ebp, (%esi)
+	mov	%ebx, 4(%esi)
+	mov	32(%esp), %eax
+	add	$36, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+
+L(fix):	seta	%dl
+	cmp	20(%esp), %ebp
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	%edi
+	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	jmp	L(bck)
+
+L(35):	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	movl	$1, 32(%esp)
+	jmp	L(8)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/fat/com.c b/third_party/gmp/mpn/x86/fat/com.c
new file mode 100644
index 0000000..d359d4c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/com.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_com.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/com.c"
diff --git a/third_party/gmp/mpn/x86/fat/fat.c b/third_party/gmp/mpn/x86/fat/fat.c
new file mode 100644
index 0000000..18be05a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/fat.c
@@ -0,0 +1,499 @@
+/* x86 fat binary initializers.
+
+   THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
+   THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
+   COMPLETELY IN FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2011-2013, 2015, 2017, 2018 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>    /* for printf */
+#include <stdlib.h>   /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+int  __gmpn_cpuid_available (void);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+   variable.  Anything can be used, but for now it's the canonical cpu types
+   as per config.guess/config.sub.  */
+
+#define __gmpn_cpuid            fake_cpuid
+#define __gmpn_cpuid_available  fake_cpuid_available
+
+#define MAKE_FMS(family, model)						\
+  ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20)			\
+   + (((model) & 0xf) << 4) + (((model)  &  0xf0) << 12))
+
+static struct {
+  const char  *name;
+  const char  *vendor;
+  unsigned    fms;
+} fake_cpuid_table[] = {
+  { "i386",       "" },
+  { "i486",       "GenuineIntel", MAKE_FMS (4, 0) },
+  { "pentium",    "GenuineIntel", MAKE_FMS (5, 0) },
+  { "pentiummmx", "GenuineIntel", MAKE_FMS (5, 4) },
+  { "pentiumpro", "GenuineIntel", MAKE_FMS (6, 0) },
+  { "pentium2",   "GenuineIntel", MAKE_FMS (6, 2) },
+  { "pentium3",   "GenuineIntel", MAKE_FMS (6, 7) },
+  { "pentium4",   "GenuineIntel", MAKE_FMS (15, 2) },
+  { "prescott",   "GenuineIntel", MAKE_FMS (15, 3) },
+  { "nocona",     "GenuineIntel", MAKE_FMS (15, 4) },
+  { "core2",      "GenuineIntel", MAKE_FMS (6, 0xf) },
+  { "nehalem",    "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "nhm",        "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "atom",       "GenuineIntel", MAKE_FMS (6, 0x1c) },
+  { "westmere",   "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "wsm",        "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "sbr",        "GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "slm",        "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "haswell",    "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "hwl",        "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "broadwell",  "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "bwl",        "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "skylake",    "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "sky",        "GenuineIntel", MAKE_FMS (6, 0x5e) },
+
+  { "k5",         "AuthenticAMD", MAKE_FMS (5, 0) },
+  { "k6",         "AuthenticAMD", MAKE_FMS (5, 3) },
+  { "k62",        "AuthenticAMD", MAKE_FMS (5, 8) },
+  { "k63",        "AuthenticAMD", MAKE_FMS (5, 9) },
+  { "athlon",     "AuthenticAMD", MAKE_FMS (6, 0) },
+  { "k8",         "AuthenticAMD", MAKE_FMS (15, 0) },
+  { "k10",        "AuthenticAMD", MAKE_FMS (16, 0) },
+  { "bobcat",     "AuthenticAMD", MAKE_FMS (20, 1) },
+  { "bulldozer",  "AuthenticAMD", MAKE_FMS (21, 1) },
+  { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+  { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+  { "excavator",  "AuthenticAMD", MAKE_FMS (21, 0x60) },
+  { "jaguar",     "AuthenticAMD", MAKE_FMS (22, 1) },
+
+  { "viac3",      "CentaurHauls", MAKE_FMS (6, 0) },
+  { "viac32",     "CentaurHauls", MAKE_FMS (6, 9) },
+  { "nano",       "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+  char  *s;
+  int   i;
+
+  s = getenv ("GMP_CPU_TYPE");
+  if (s == NULL)
+    {
+      printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+      abort ();
+    }
+
+  for (i = 0; i < numberof (fake_cpuid_table); i++)
+    if (strcmp (s, fake_cpuid_table[i].name) == 0)
+      return i;
+
+  printf ("GMP_CPU_TYPE=%s unknown\n", s);
+  abort ();
+}
+
+static int
+fake_cpuid_available (void)
+{
+  return fake_cpuid_table[fake_cpuid_lookup()].vendor[0] != '\0';
+}
+
+static long
+fake_cpuid (char dst[12], int id)
+{
+  int  i = fake_cpuid_lookup();
+
+  switch (id) {
+  case 0:
+    memcpy (dst, fake_cpuid_table[i].vendor, 12);
+    return 0;
+  case 1:
+    return fake_cpuid_table[i].fms;
+  default:
+    printf ("fake_cpuid(): oops, unknown id %d\n", id);
+    abort ();
+  }
+}
+#endif
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1    ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+  __MPN(add_n_init),
+  0,
+  0,
+  __MPN(addmul_1_init),
+  0,
+  __MPN(bdiv_dbm1c_init),
+  __MPN(cnd_add_n_init),
+  __MPN(cnd_sub_n_init),
+  __MPN(com_init),
+  __MPN(copyd_init),
+  __MPN(copyi_init),
+  __MPN(divexact_1_init),
+  __MPN(divrem_1_init),
+  __MPN(gcd_11_init),
+  __MPN(lshift_init),
+  __MPN(lshiftc_init),
+  __MPN(mod_1_init),
+  __MPN(mod_1_1p_init),
+  __MPN(mod_1_1p_cps_init),
+  __MPN(mod_1s_2p_init),
+  __MPN(mod_1s_2p_cps_init),
+  __MPN(mod_1s_4p_init),
+  __MPN(mod_1s_4p_cps_init),
+  __MPN(mod_34lsub1_init),
+  __MPN(modexact_1c_odd_init),
+  __MPN(mul_1_init),
+  __MPN(mul_basecase_init),
+  __MPN(mullo_basecase_init),
+  __MPN(preinv_divrem_1_init),
+  __MPN(preinv_mod_1_init),
+  __MPN(redc_1_init),
+  __MPN(redc_2_init),
+  __MPN(rshift_init),
+  __MPN(sqr_basecase_init),
+  __MPN(sub_n_init),
+  0,
+  __MPN(submul_1_init),
+  0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+   specifics for a chip, and higher versions of that chip.
+
+   The arrangement of the setups here will normally be the same as the $path
+   selections in configure.in for the respective chips.
+
+   This code is reentrant and thread safe.  We always calculate the same
+   decided_cpuvec, so if two copies of the code are running it doesn't
+   matter which completes first, both write the same to __gmpn_cpuvec.
+
+   We need to go via decided_cpuvec because if one thread has completed
+   __gmpn_cpuvec then it may be making use of the threshold values in that
+   vector.  If another thread is still running __gmpn_cpuvec_init then we
+   don't want it to write different values to those fields since some of the
+   asm routines only operate correctly up to their own defined threshold,
+   not an arbitrary value.  */
+
+void
+__gmpn_cpuvec_init (void)
+{
+  struct cpuvec_t  decided_cpuvec;
+
+  TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+  memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+  CPUVEC_SETUP_x86;
+  CPUVEC_SETUP_fat;
+
+  if (! __gmpn_cpuid_available ())
+    {
+      TRACE (printf ("  80386, or early 80486 without cpuid\n"));
+    }
+  else
+    {
+      char vendor_string[13];
+      char dummy_string[12];
+      long fms;
+      int family, model;
+
+      __gmpn_cpuid (vendor_string, 0);
+      vendor_string[12] = 0;
+
+      fms = __gmpn_cpuid (dummy_string, 1);
+      family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+      model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+      if (strcmp (vendor_string, "GenuineIntel") == 0)
+        {
+          switch (family)
+            {
+            case 4:
+              TRACE (printf ("  80486 with cpuid\n"));
+              break;
+
+            case 5:
+              TRACE (printf ("  pentium\n"));
+              CPUVEC_SETUP_pentium;
+              if (model == 4 || model == 8)
+                {
+                  TRACE (printf ("  pentiummmx\n"));
+                  CPUVEC_SETUP_pentium_mmx;
+                }
+              break;
+
+            case 6:
+              TRACE (printf ("  p6\n"));
+              CPUVEC_SETUP_p6;
+	      switch (model)
+		{
+		case 0x00:
+		case 0x01:
+		  TRACE (printf ("  pentiumpro\n"));
+		  break;
+
+		case 0x02:
+		case 0x03:
+		case 0x04:
+		case 0x05:
+		case 0x06:
+		  TRACE (printf ("  pentium2\n"));
+                  CPUVEC_SETUP_p6_mmx;
+		  break;
+
+		case 0x07:
+		case 0x08:
+		case 0x0a:
+		case 0x0b:
+		case 0x0c:
+		  TRACE (printf ("  pentium3\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+		  break;
+
+		case 0x09:		/* Banias */
+		case 0x0d:		/* Dothan */
+		case 0x0e:		/* Yonah */
+		  TRACE (printf ("  Banias/Dothan/Yonah\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  break;
+
+		case 0x0f:		/* Conroe Merom Kentsfield Allendale */
+		case 0x10:
+		case 0x11:
+		case 0x12:
+		case 0x13:
+		case 0x14:
+		case 0x15:
+		case 0x16:
+		case 0x17:		/* PNR Wolfdale Yorkfield */
+		case 0x18:
+		case 0x19:
+		case 0x1d:		/* PNR Dunnington */
+		  TRACE (printf ("  Conroe\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  break;
+
+		case 0x1c:		/* Atom Silverthorne */
+		case 0x26:		/* Atom Lincroft */
+		case 0x27:		/* Atom Saltwell */
+		case 0x36:		/* Atom Cedarview/Saltwell */
+		  TRACE (printf ("  atom\n"));
+		  CPUVEC_SETUP_atom;
+		  CPUVEC_SETUP_atom_mmx;
+		  CPUVEC_SETUP_atom_sse2;
+		  break;
+
+		case 0x1a:		/* NHM Gainestown */
+		case 0x1b:
+		case 0x1e:		/* NHM Lynnfield/Jasper */
+		case 0x1f:
+		case 0x20:
+		case 0x21:
+		case 0x22:
+		case 0x23:
+		case 0x24:
+		case 0x25:		/* WSM Clarkdale/Arrandale */
+		case 0x28:
+		case 0x29:
+		case 0x2b:
+		case 0x2c:		/* WSM Gulftown */
+		case 0x2e:		/* NHM Beckton */
+		case 0x2f:		/* WSM Eagleton */
+		  TRACE (printf ("  nehalem/westmere\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  CPUVEC_SETUP_coreinhm;
+		  break;
+
+		case 0x2a:		/* SBR */
+		case 0x2d:		/* SBR-EP */
+		case 0x3a:		/* IBR */
+		case 0x3c:		/* Haswell client */
+		case 0x3f:		/* Haswell server */
+		case 0x45:		/* Haswell ULT */
+		case 0x46:		/* Crystal Well */
+		case 0x3d:		/* Broadwell */
+		case 0x47:		/* Broadwell */
+		case 0x4f:		/* Broadwell server */
+		case 0x56:		/* Broadwell microserver */
+		case 0x4e:		/* Skylake client */
+		case 0x55:		/* Skylake server */
+		case 0x5e:		/* Skylake */
+		case 0x8e:		/* Kabylake */
+		case 0x9e:		/* Kabylake */
+		  TRACE (printf ("  sandybridge\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  CPUVEC_SETUP_coreinhm;
+		  CPUVEC_SETUP_coreisbr;
+		  break;
+		}
+              break;
+
+            case 15:
+              TRACE (printf ("  pentium4\n"));
+              CPUVEC_SETUP_pentium4;
+              CPUVEC_SETUP_pentium4_mmx;
+              CPUVEC_SETUP_pentium4_sse2;
+              break;
+            }
+        }
+      else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+        {
+          switch (family)
+            {
+            case 5:
+              if (model <= 3)
+                {
+                  TRACE (printf ("  k5\n"));
+                }
+              else
+                {
+                  TRACE (printf ("  k6\n"));
+                  CPUVEC_SETUP_k6;
+                  CPUVEC_SETUP_k6_mmx;
+                  if (model >= 8)
+                    {
+                      TRACE (printf ("  k62\n"));
+                      CPUVEC_SETUP_k6_k62mmx;
+                    }
+                  if (model >= 9)
+                    {
+                      TRACE (printf ("  k63\n"));
+                    }
+                }
+              break;
+            case 6:
+              TRACE (printf ("  athlon\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              break;
+
+            case 0x0f:		/* k8 */
+            case 0x11:		/* "fam 11h", mix of k8 and k10 */
+            case 0x13:		/* unknown, conservatively assume k8  */
+            case 0x16:		/* unknown, conservatively assume k8  */
+            case 0x17:		/* unknown, conservatively assume k8  */
+              TRACE (printf ("  k8\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              CPUVEC_SETUP_k8;
+	      break;
+
+            case 0x10:		/* k10 */
+            case 0x12:		/* k10 (llano) */
+              TRACE (printf ("  k10\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+	      break;
+
+            case 0x14:		/* bobcat */
+              TRACE (printf ("  bobcat\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              CPUVEC_SETUP_bt1;
+	      break;
+
+            case 0x15:		/* bulldozer */
+              TRACE (printf ("  bulldozer\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+	      break;
+            }
+        }
+      else if (strcmp (vendor_string, "CentaurHauls") == 0)
+        {
+          switch (family)
+            {
+            case 6:
+              TRACE (printf ("  viac3\n"));
+              if (model >= 9)
+                {
+                  TRACE (printf ("  viac32\n"));
+                }
+	      if (model >= 15)
+		{
+                  TRACE (printf ("  nano\n"));
+		  CPUVEC_SETUP_nano;
+		}
+              break;
+            }
+        }
+      else if (strcmp (vendor_string, "CyrixInstead") == 0)
+        {
+          /* Should recognize Cyrix' processors too.  */
+          TRACE (printf ("  cyrix something\n"));
+        }
+    }
+
+  /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
+     Instead default to the plain versions from whichever CPU we detected.
+     The function arguments are compatible, no need for any glue code.  */
+  if (decided_cpuvec.preinv_divrem_1 == NULL)
+    decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
+  if (decided_cpuvec.preinv_mod_1 == NULL)
+    decided_cpuvec.preinv_mod_1    =(preinv_mod_1_t)   decided_cpuvec.mod_1;
+
+  ASSERT_CPUVEC (decided_cpuvec);
+  CPUVEC_INSTALL (decided_cpuvec);
+
+  /* Set this once the threshold fields are ready.
+     Use volatile to prevent it getting moved.  */
+  *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+}
diff --git a/third_party/gmp/mpn/x86/fat/fat_entry.asm b/third_party/gmp/mpn/x86/fat/fat_entry.asm
new file mode 100644
index 0000000..25655cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/fat_entry.asm
@@ -0,0 +1,243 @@
+dnl  x86 fat binary entrypoints.
+
+dnl  Copyright 2003, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl  Forcibly disable profiling.
+dnl
+dnl  The entrypoints and inits are small enough not to worry about, the real
+dnl  routines arrived at will have any profiling.  Also, the way the code
+dnl  here ends with a jump means we won't work properly with the
+dnl  "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+	TEXT
+
+
+dnl  Usage: FAT_ENTRY(name, offset)
+dnl
+dnl  Emit a fat binary entrypoint function of the given name.  This is the
+dnl  normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl  The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl  the given "offset" (in bytes).
+dnl
+dnl  For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be
+dnl  fine for all x86s.
+dnl
+dnl  For PIC, the jumps are 20 bytes each, and are best aligned to 16 to
+dnl  ensure at least the first two instructions don't cross a cache line
+dnl  boundary.
+dnl
+dnl  Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl  grepping in configure, stopping that code trying to eval something with
+dnl  $1 in it.
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+`	ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)dnl
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+	call	L(movl_eip_edx)
+	movl	L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+	jmp	*m4_empty_if_zero($2)(%edx)
+',`dnl
+	call	L(movl_eip_edx)
+L(entry_here$2):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(entry_here$2)], %edx
+	movl	GSYM_PREFIX`'__gmpn_cpuvec@GOT(%edx), %edx
+	jmp	*m4_empty_if_zero($2)(%edx)
+')
+',`dnl non-PIC
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')
+
+
+dnl  FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+ifdef(`PIC',`
+	ALIGN(8)
+L(movl_eip_edx):
+	movl	(%esp), %edx
+	ret_internal
+ifdef(`DARWIN',`
+	.section	__IMPORT,__pointers,non_lazy_symbol_pointers
+L(___gmpn_cpuvec)$non_lazy_ptr:
+	.indirect_symbol	___gmpn_cpuvec
+	.long	0
+	TEXT
+')
+')
+
+
+dnl  Usage: FAT_INIT(name, offset)
+dnl
+dnl  Emit a fat binary initializer function of the given name.  These
+dnl  functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl  The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl  the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl  __gmpn_cpuvec_init will have stored the address of the selected
+dnl  implementation there.
+dnl
+dnl  Only one of these routines will be executed, and only once, since after
+dnl  that all the __gmpn_cpuvec pointers go to real routines.  So there's no
+dnl  need for anything special here, just something small and simple.  To
+dnl  keep code size down, "fat_init" is a shared bit of code, arrived at
+dnl  with the offset in %al.  %al is used since the movb instruction is 2
+dnl  bytes where %eax would be 4.
+dnl
+dnl  Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
+dnl  HAVE_NATIVE grepping in configure, preventing that code trying to eval
+dnl  something with $1 in it.
+
+define(FAT_INIT,
+m4_assert_numargs(2)
+`PROLOGUE($1)dnl
+	movb	$`'$2, %al
+	jmp	L(fat_init)
+EPILOGUE()
+')
+
+L(fat_init):
+	C al	__gmpn_cpuvec byte offset
+
+	movzbl	%al, %eax
+	pushl	%eax
+
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+	sub	$8, %esp
+	CALL(	__gmpn_cpuvec_init)
+	add	$8, %esp
+	call	L(movl_eip_edx)
+	movl	L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+',`dnl
+	pushl	%ebx
+	call	L(movl_eip_ebx)
+L(init_here):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(init_here)], %ebx
+	CALL(	__gmpn_cpuvec_init)
+	movl	GSYM_PREFIX`'__gmpn_cpuvec@GOT(%ebx), %edx
+	popl	%ebx
+')
+	popl	%eax
+	jmp	*(%edx,%eax)
+
+L(movl_eip_ebx):
+	movl	(%esp), %ebx
+	ret_internal
+',`dnl non-PIC
+	sub	$8, %esp		C needed on Darwin, harmless elsewhere
+	CALL(	__gmpn_cpuvec_init)
+	add	$8, %esp		C needed on Darwin, harmless elsewhere
+	popl	%eax
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec(%eax)
+')
+
+dnl  FAT_INIT for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+
+
+C long __gmpn_cpuid (char dst[12], int id);
+C
+C This is called only once, so just something simple and compact is fine.
+
+defframe(PARAM_ID,  8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+PROLOGUE(__gmpn_cpuid)
+	pushl	%esi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+	movl	PARAM_ID, %eax
+	cpuid
+	movl	PARAM_DST, %esi
+	movl	%ebx, (%esi)
+	movl	%edx, 4(%esi)
+	movl	%ecx, 8(%esi)
+	popl	%ebx
+	popl	%esi
+	ret
+EPILOGUE()
+
+
+C int __gmpn_cpuid_available (void);
+C
+C Return non-zero if the cpuid instruction is available, which means late
+C model 80486 and higher.  80386 and early 80486 don't have cpuid.
+C
+C The test follows Intel AP-485 application note, namely that if bit 21 is
+C modifiable then cpuid is supported.  This test is reentrant and thread
+C safe, since of course any interrupt or context switch will preserve the
+C flags while we're tinkering with them.
+C
+C This is called only once, so just something simple and compact is fine.
+
+PROLOGUE(__gmpn_cpuid_available)
+	pushf
+	popl	%ecx		C old flags
+
+	movl	%ecx, %edx
+	xorl	$0x200000, %edx
+	pushl	%edx
+	popf
+	pushf
+	popl	%edx		C tweaked flags
+
+	movl	$1, %eax
+	cmpl	%ecx, %edx
+	jne	L(available)
+	xorl	%eax, %eax	C not changed, so cpuid not available
+
+L(available):
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/fat/gmp-mparam.h b/third_party/gmp/mpn/x86/fat/gmp-mparam.h
new file mode 100644
index 0000000..3641a6b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/gmp-mparam.h
@@ -0,0 +1,71 @@
+/* Fat binary x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2003, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes.  The only time
+   this might not be true currently is for actual 80386 and 80486 chips,
+   where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
+   that's not worth worrying about.  */
+#define DIVEXACT_1_THRESHOLD  0
+
+/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
+   USE_PREINV_DIVREM_1 so that all callers use it, and then let the
+   __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
+   preinv.  */
+#define USE_PREINV_DIVREM_1   1
+
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
+   for mpn_sqr to call the latter.  */
+#define SQR_BASECASE_THRESHOLD 0
+
+/* Sensible fallbacks for these, when not taken from a cpu-specific
+   gmp-mparam.h.  */
+#define MUL_TOOM22_THRESHOLD      20
+#define MUL_TOOM33_THRESHOLD     130
+#define SQR_TOOM2_THRESHOLD       30
+#define SQR_TOOM3_THRESHOLD      200
+
+/* These are values more or less in the middle of what the typical x86 chips
+   come out as.  For a fat binary it's necessary to have values for these,
+   since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
+   as non-constant array initializers.  FIXME: Perhaps these should be done
+   in the cpuvec structure like other thresholds.  */
+#define MUL_FFT_TABLE  { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          400
+#define MUL_FFT_THRESHOLD              2000
+
+#define SQR_FFT_TABLE  { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          500
+#define SQR_FFT_THRESHOLD              3000
diff --git a/third_party/gmp/mpn/x86/fat/lshiftc.c b/third_party/gmp/mpn/x86/fat/lshiftc.c
new file mode 100644
index 0000000..9ecf489
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/lshiftc.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_lshiftc.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/lshiftc.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1.c b/third_party/gmp/mpn/x86/fat/mod_1.c
new file mode 100644
index 0000000..4f149cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mod_1.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mod_1.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_1.c b/third_party/gmp/mpn/x86/fat/mod_1_1.c
new file mode 100644
index 0000000..92eaa7a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_1.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1_1p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1_1p_cps)
+*/
+
+#define OPERATION_mod_1_1_cps 1
+#include "mpn/generic/mod_1_1.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_2.c b/third_party/gmp/mpn/x86/fat/mod_1_2.c
new file mode 100644
index 0000000..9095a61
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_2.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_2p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1s_2p_cps)
+*/
+
+#define OPERATION_mod_1_2_cps 1
+#include "mpn/generic/mod_1_2.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_4.c b/third_party/gmp/mpn/x86/fat/mod_1_4.c
new file mode 100644
index 0000000..51c0def
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_4.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_4p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1s_4p_cps)
+*/
+
+#define OPERATION_mod_1_4_cps 1
+#include "mpn/generic/mod_1_4.c"
diff --git a/third_party/gmp/mpn/x86/fat/mode1o.c b/third_party/gmp/mpn/x86/fat/mode1o.c
new file mode 100644
index 0000000..870ddb8
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mode1o.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_modexact_1c_odd.
+
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mode1o.c"
diff --git a/third_party/gmp/mpn/x86/fat/mullo_basecase.c b/third_party/gmp/mpn/x86/fat/mullo_basecase.c
new file mode 100644
index 0000000..7f86be6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mullo_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mullo_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mullo_basecase.c"
diff --git a/third_party/gmp/mpn/x86/fat/redc_1.c b/third_party/gmp/mpn/x86/fat/redc_1.c
new file mode 100644
index 0000000..0025403
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/redc_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_1.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_1.c"
diff --git a/third_party/gmp/mpn/x86/fat/redc_2.c b/third_party/gmp/mpn/x86/fat/redc_2.c
new file mode 100644
index 0000000..1932d58
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/redc_2.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_2.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_2.c"
diff --git a/third_party/gmp/mpn/x86/gcd_11.asm b/third_party/gmp/mpn/x86/gcd_11.asm
new file mode 100644
index 0000000..af69135
--- /dev/null
+++ b/third_party/gmp/mpn/x86/gcd_11.asm
@@ -0,0 +1,126 @@
+dnl  x86 mpn_gcd_11 optimised for processors with slow BSF.
+
+dnl  Based on C version.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl  Rudimentary code for x86-32, i.e. for CPUs without cmov.  Also, the bsf
+dnl  instruction is assumed to be so slow it is useless.  Instead a table is
+dnl  used.
+dnl
+dnl  The loop benefits from OoO; in-order CPUs might want a different loop.
+dnl  The ebx and ecx registers could be combined if the assignment of ecx were
+dnl  postponed until ebx died, but that would at least hurt in-order CPUs.
+
+C	     cycles/bit (approx)
+C AMD K7	 ?
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4-2	 ?
+C Intel P4-3/4	 ?
+C Intel P6/13	 ?
+C Intel CNR	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), u0
+	mov	20(%esp), v0
+	LEAL(	ctz_table, %esi)
+	sub	v0, u0			C u = u - v		0
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	sbb	%ebx, %ebx		C mask			1
+	mov	u0, %edi		C			1
+	mov	u0, %ecx		C			1
+	and	%ebx, %edi		C			2
+	xor	%ebx, u0		C			2
+	add	%edi, v0		C v = min(u,v)		3
+	sub	%ebx, u0		C u = |u - v|		3
+L(mid):	and	$MASK, %ecx		C			2
+	movzbl	(%esi,%ecx), %ecx	C			3
+	jz	L(shift_alot)
+	shr	%cl, u0			C			4
+	sub	v0, u0			C u = u - v		0,5
+	jnz	L(top)
+
+L(end):	mov	v0, %eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, u0
+	mov	u0, %ecx
+	jmp	L(mid)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/geode/gmp-mparam.h b/third_party/gmp/mpn/x86/geode/gmp-mparam.h
new file mode 100644
index 0000000..cc9c9f1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/geode/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-01-30, gcc 3.4 */
+
+#define MOD_1_NORM_THRESHOLD                 6
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           42
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM44_THRESHOLD               105
+#define MUL_TOOM6H_THRESHOLD               141
+#define MUL_TOOM8H_THRESHOLD               212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      67
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 33
+#define SQR_TOOM3_THRESHOLD                 60
+#define SQR_TOOM4_THRESHOLD                136
+#define SQR_TOOM6_THRESHOLD                196
+#define SQR_TOOM8_THRESHOLD                292
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             468  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    468, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 61
+#define MUL_FFT_THRESHOLD                 5504
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 61
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  37
+#define MULLO_MUL_N_THRESHOLD            10950
+
+#define DC_DIV_QR_THRESHOLD                 59
+#define DC_DIVAPPR_Q_THRESHOLD             189
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               183
+#define INV_APPR_THRESHOLD                 181
+
+#define BINV_NEWTON_THRESHOLD              204
+#define REDC_1_TO_REDC_N_THRESHOLD          54
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               81
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD                998
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                     133
+#define GCD_DC_THRESHOLD                   451
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define SET_STR_DC_THRESHOLD               547
+#define SET_STR_PRECOMPUTE_THRESHOLD      1049
diff --git a/third_party/gmp/mpn/x86/gmp-mparam.h b/third_party/gmp/mpn/x86/gmp-mparam.h
new file mode 100644
index 0000000..2cb1984
--- /dev/null
+++ b/third_party/gmp/mpn/x86/gmp-mparam.h
@@ -0,0 +1,38 @@
+/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* Generic x86 mpn_divexact_1 is faster than generic x86 mpn_divrem_1 on all
+   of p5, p6, k6 and k7, so use it always.  It's probably slower on 386 and
+   486, but that's too bad.  */
+#define DIVEXACT_1_THRESHOLD  0
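A threshold of 0 means the tuned routine is used at every operand size (and MP_SIZE_T_MAX, seen in other files here, means "never").  As a rough illustration only (this is not GMP's actual dispatch code; the names are hypothetical), such a threshold typically gates an algorithm choice like this:

```c
#include <assert.h>
#include <string.h>

#define DEMO_DIVEXACT_1_THRESHOLD 0   /* 0: use the tuned routine always */

/* Hypothetical dispatcher: pick a division routine for an n-limb
   operand.  Real GMP makes comparable choices with compile-time
   thresholds from gmp-mparam.h, but not with this function. */
static const char *choose_div_routine(size_t n)
{
    if (n >= DEMO_DIVEXACT_1_THRESHOLD)
        return "divexact_1";          /* tuned path */
    return "divrem_1";                /* fallback below the threshold */
}
```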
diff --git a/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h b/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h
new file mode 100644
index 0000000..3d37fa3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* Intel Goldmont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 7
+#define MOD_1_UNNORM_THRESHOLD              12
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 32.79% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             32
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           23
+
+#define DIV_1_VS_MUL_1_PERCENT             228
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               193
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     125
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     137
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     185
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                113
+#define SQR_TOOM4_THRESHOLD                280
+#define SQR_TOOM6_THRESHOLD                399
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             60
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    368, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,10}, {    303, 9}, \
+    {    607,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    703,10}, \
+    {   1407,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    831,11}, {   1663,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,12}, {   7679,15}, \
+    {   1023,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2559,13}, {   5119,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 171
+#define MUL_FFT_THRESHOLD                 3712
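Each FFT table entry pairs an operand size in limbs with a splitting parameter k.  The sketch below shows one way such a table can be consulted; it is modelled loosely on GMP's internal mpn_fft_best_k but simplified and not the library's actual code: the k in force applies until the operand size exceeds the next entry's size shifted left by that k.

```c
#include <assert.h>
#include <stddef.h>

struct fft_entry { long n; int k; };

/* Simplified table3-style lookup (an approximation of GMP's internal
   logic, not a copy of it).  Entry i takes effect once the operand
   size exceeds tab[i].n << (the k currently in force). */
static int fft_best_k(const struct fft_entry *tab, size_t len, long n)
{
    int last_k = tab[0].k;
    for (size_t i = 1; i < len; i++) {
        if (n <= (tab[i].n << last_k))
            break;
        last_k = tab[i].k;
    }
    return last_k;
}

/* First few Goldmont MUL_FFT_TABLE3 entries from above, for illustration. */
static const struct fft_entry demo_tab[] =
  { {368, 5}, {21, 6}, {11, 5}, {23, 6} };
```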
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303, 8}, {    607, 9}, {    319,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    479,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    831,11}, \
+    {   1663,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3839,12}, {   7679,15}, {   1023,14}, \
+    {   2047,13}, {   4095,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 170
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_DC_THRESHOLD                  50
+#define MULLO_MUL_N_THRESHOLD             6633
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  95
+#define SQRLO_SQR_THRESHOLD               6633
+
+#define DC_DIV_QR_THRESHOLD                 68
+#define DC_DIVAPPR_Q_THRESHOLD             204
+#define DC_BDIV_QR_THRESHOLD                64
+#define DC_BDIV_Q_THRESHOLD                108
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               276
+#define INV_APPR_THRESHOLD                 226
+
+#define BINV_NEWTON_THRESHOLD              298
+#define REDC_1_TO_REDC_N_THRESHOLD          65
+
+#define MU_DIV_QR_THRESHOLD               1528
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD              140
+#define MU_BDIV_QR_THRESHOLD              1334
+#define MU_BDIV_Q_THRESHOLD               1499
+
+#define POWM_SEC_TABLE  3,16,96,428,1317
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               704
+#define SET_STR_PRECOMPUTE_THRESHOLD      1358
+
+#define FAC_DSC_THRESHOLD                   95
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD2_DIV1_METHOD                    1  /* 5.53% faster than 3 */
+#define HGCD_THRESHOLD                     172
+#define HGCD_APPR_THRESHOLD                204
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   610
+#define GCDEXT_DC_THRESHOLD                443
+#define JACOBI_BASE_METHOD                   4  /* 6.53% faster than 3 */
+
+/* Tuneup completed successfully, took 101563 seconds */
diff --git a/third_party/gmp/mpn/x86/i486/gmp-mparam.h b/third_party/gmp/mpn/x86/i486/gmp-mparam.h
new file mode 100644
index 0000000..aa7dbad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/i486/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* 80486 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2001-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 100MHz DX4 */
+
+/* Generated by tuneup.c, 2003-02-13, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             18
+#define MUL_TOOM33_THRESHOLD            228
+
+#define SQR_BASECASE_THRESHOLD           13
+#define SQR_TOOM2_THRESHOLD              49
+#define SQR_TOOM3_THRESHOLD             238
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 72
+#define POWM_THRESHOLD                   38
+
+#define GCD_ACCEL_THRESHOLD               3
+#define JACOBI_BASE_METHOD                2
+
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  0
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD         17
+
+#define GET_STR_DC_THRESHOLD             32
+#define GET_STR_PRECOMPUTE_THRESHOLD     82
+#define SET_STR_THRESHOLD              3524
+
+#define MUL_FFT_TABLE  { 464, 928, 1920, 4608, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          392
+#define MUL_FFT_THRESHOLD              2816
+
+#define SQR_FFT_TABLE  { 432, 928, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          392
+#define SQR_FFT_THRESHOLD              2816
diff --git a/third_party/gmp/mpn/x86/k10/gmp-mparam.h b/third_party/gmp/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 0000000..eceaaae
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,217 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014-2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3200-3600 MHz K10 Thuban */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         14
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 29.33% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define DIV_1_VS_MUL_1_PERCENT             258
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               124
+#define MUL_TOOM6H_THRESHOLD               274
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     113
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                105
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                238
+#define SQR_TOOM8_THRESHOLD                309
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     21, 6}, {     11, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471, 9}, \
+    {   2943,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 168
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671, 9}, {   1343,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    799, 9}, \
+    {   1599,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    959,10}, \
+    {   1919,11}, {    991,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4351,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4351,14}, \
+    {   2303,13}, {   4991,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             6
+#define MULLO_DC_THRESHOLD                  40
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 113
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             248
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                158
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               254
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1558
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1524
+
+#define POWM_SEC_TABLE  1,16,102,416,1378
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1105
+
+#define FAC_DSC_THRESHOLD                  159
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 0.70% faster than 4 */
+#define HGCD_THRESHOLD                     130
+#define HGCD_APPR_THRESHOLD                163
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   573
+#define GCDEXT_DC_THRESHOLD                393
+#define JACOBI_BASE_METHOD                   4  /* 9.13% faster than 1 */
+
+/* Tuneup completed successfully, took 52901 seconds */
diff --git a/third_party/gmp/mpn/x86/k6/README b/third_party/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000..1d65af3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/README
@@ -0,0 +1,251 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+			AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx subdirectory has MMX code suited to plain K6; the k62mmx
+subdirectory has MMX code suited to K6-2 and K6-3.  All chips in the K6
+family have MMX; the separate directories exist just so that ./configure
+can omit them if the assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+                                 cycles/limb
+
+	mpn_add_n/sub_n            3.25 normal, 2.75 in-place
+
+	mpn_mul_1                  6.25
+	mpn_add/submul_1           7.65-8.4  (varying with data values)
+
+	mpn_mul_basecase           9.25 cycles/crossproduct (approx)
+	mpn_sqr_basecase           4.7  cycles/crossproduct (approx)
+                                   or 9.2 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               3.0
+
+	mpn_divrem_1              20.0
+	mpn_mod_1                 20.0
+	mpn_divexact_by3          11.0
+
+	mpn_copyi                  1.0
+	mpn_copyd                  1.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+	mpn_l/rshift               1.75
+
+
+Prefetching of sources hasn't yet given any joy.  With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster.  Results so far are inconsistent.  The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and in both together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead.  The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes that
+are not a multiple of the unrolling factor.  An attractive feature of this
+is that times increase smoothly with operand size, but an indirect jump is
+about 6 cycles and the setups about another 6, so whether a computed jump
+ought to be used depends on how much faster the unrolled code is than a
+simple loop.
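In C, the closest analogue of a computed jump into an unrolled loop is Duff's device: the switch jumps into the middle of a 4-way unrolled body so the n mod 4 leftover iterations need no separate cleanup loop.  This is a sketch of the idea only; the assembly routines here compute the jump target address directly rather than via a switch.

```c
#include <assert.h>
#include <stddef.h>

/* Duff's-device sketch of a computed entry into a 4-way unrolled copy
   loop.  The switch selects how many copies the first pass performs;
   every later pass does all four. */
static void copy_limbs(unsigned *dst, const unsigned *src, size_t n)
{
    if (n == 0)
        return;
    size_t iters = (n + 3) / 4;          /* passes round the loop */
    switch (n % 4) {                     /* computed entry point */
    case 0: do { *dst++ = *src++;        /* each case falls through */
    case 3:      *dst++ = *src++;
    case 2:      *dst++ = *src++;
    case 1:      *dst++ = *src++;
            } while (--iters > 0);
    }
}
```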
+
+Position independent code is implemented using a call to get eip for
+computed jumps, and a ret is always done rather than an addl $4,%esp or a
+popl, so that the CPU's return-address branch prediction stack stays
+synchronised with the actual stack in memory.  Such a call, however, still
+costs 4 to 7 cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken.  Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations.  This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential, they
+just show what slots might be available.  When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+  short decode is inhibited.  The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+  2 instructions before a 32 byte boundary.  This makes use of the 16 byte
+  cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector.  0(%esi) doesn't have this
+  problem, and can be used as an equivalent, or easier is just to use a
+  different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem.  (K6-2 CXT and K6-3
+  have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+  If more than 3 bytes are needed to determine instruction length then
+  decoding degrades from direct to long, or from long to vector.  This
+  happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+  with mod=00 the sib determines whether there's a displacement.
+
+  This affects all MMX and 3DNow instructions, and others with an 0F prefix,
+  like movzbl.  The modes affected are anything with an index and no
+  displacement, or an index but no base, and this includes (%esp) which is
+  really (,%esp,1).
+
+  The cross.pl script detects problem cases.  The workaround is to always
+  use a displacement, and to do this with Zdisp if it's zero so the
+  assembler doesn't discard it.
+
+  See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+  13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+  cycles.
+
+Various
+
+- adcl      2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf       12-27 cycles
+- emms      5 cycles
+- femms     3 cycles
+- jecxz     2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl      20 cycles back-to-back
+- imull     2 decode, 3 execute
+- mull      2 decode, 3 execute (optimization manual decoding sample)
+- prefetch  2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+            immediate or %cl count: 11 + 2 per bit for dword
+                                    13 + 4 per bit for byte
+- setCC	    2 cycles
+- xchgl	%eax,reg  1.5 cycles, back-to-back (strange)
+        reg,reg   2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000.  This describes K6-2 and
+K6-3.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997.  This is an
+older edition of the above document, describing plain K6.  Available
+on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999.  This has some notes on general K6 optimizations as well as
+3DNow.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/k6/aors_n.asm b/third_party/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000..168f9b4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,337 @@
+dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                      mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
+
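The add/subtract semantics documented above (src1,size combined with src2,size into dst,size, returning the top carry bit) can be sketched in plain Python for reference. This is an illustrative model only, not GMP code; the function name and limb conventions (32-bit limbs, least significant limb first) are assumptions mirroring the prototypes.

```python
B = 2 ** 32  # limb size for the 32-bit x86 code above

def mpn_add_nc(src1, src2, carry=0):
    # Limb-by-limb add with carry propagation, least significant
    # limb first: what the assembly's adcl chain computes.
    dst = []
    for a, b in zip(src1, src2):
        s = a + b + carry
        dst.append(s % B)   # low 32 bits stored to dst
        carry = s // B      # carry into the next limb (0 or 1)
    return dst, carry
```

For example, adding [B-1, B-1] and [1, 0] overflows both limbs and returns a final carry of 1, matching the documented 1-or-0 return value.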
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE,  `FRAME+16(%esp)')
+define(PARAM_SRC2,  `FRAME+12(%esp)')
+define(PARAM_SRC1,  `FRAME+8(%esp)')
+define(PARAM_DST,   `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl  minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+	movl	PARAM_CARRY, %eax
+	jmp	L(start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+	xorl	%eax, %eax
+L(start):
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebx
+FRAME_pushl()
+
+	movl	PARAM_SRC1, %ebx
+	pushl	%edi
+FRAME_pushl()
+
+	movl	PARAM_SRC2, %edx
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_DST, %edi
+	jae	L(unroll)
+
+
+	shrl	%eax		C initial carry flag
+
+	C offset 0x21 here, close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C The store to (%edi) could be done with a stosl; it'd be smaller
+	C code, but there's no speed gain and a cld would have to be added
+	C (per mpn/x86/README).
+
+	movl	(%ebx), %eax
+	leal	4(%ebx), %ebx
+
+	M4_inst	(%edx), %eax
+
+	movl	%eax, (%edi)
+	leal	4(%edi), %edi
+
+	leal	4(%edx), %edx
+	loop	L(simple)
+
+
+	movl	$0, %eax
+	popl	%edi
+
+	setc	%al
+
+	popl	%ebx
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+	C eax	carry
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	cmpl	%edi, %ebx
+	pushl	%esi
+
+	je	L(inplace)
+
+ifdef(`OPERATION_add_n',`
+	cmpl	%edi, %edx
+
+	je	L(inplace_reverse)
+')
+
+	movl	%ecx, %esi
+
+	andl	$-4, %ecx
+	andl	$3, %esi
+
+	leal	(%ebx,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %edx
+	leal	(%edi,%ecx,4), %edi
+
+	negl	%ecx
+	shrl	%eax
+
+	ALIGN(32)
+L(normal_top):
+	C eax	counter, qwords, negative
+	C ebx	src1
+	C ecx	scratch
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	leal	5(%ecx), %ecx
+	M4_inst	-20(%edx,%ecx,4), %eax
+	movl	%eax, -20(%edi,%ecx,4)
+
+	movl	4-20(%ebx,%ecx,4), %eax
+	M4_inst	4-20(%edx,%ecx,4), %eax
+	movl	%eax, 4-20(%edi,%ecx,4)
+
+	movl	8-20(%ebx,%ecx,4), %eax
+	M4_inst	8-20(%edx,%ecx,4), %eax
+	movl	%eax, 8-20(%edi,%ecx,4)
+
+	movl	12-20(%ebx,%ecx,4), %eax
+	M4_inst	12-20(%edx,%ecx,4), %eax
+	movl	%eax, 12-20(%edi,%ecx,4)
+
+	loop	L(normal_top)
+
+
+	decl	%esi
+	jz	L(normal_finish_one)
+	js	L(normal_done)
+
+	C two or three more limbs
+
+	movl	(%ebx), %eax
+	M4_inst	(%edx), %eax
+	movl	%eax, (%edi)
+
+	movl	4(%ebx), %eax
+	M4_inst	4(%edx), %eax
+	decl	%esi
+	movl	%eax, 4(%edi)
+
+	jz	L(normal_done)
+	movl	$2, %ecx
+
+L(normal_finish_one):
+	movl	(%ebx,%ecx,4), %eax
+	M4_inst	(%edx,%ecx,4), %eax
+	movl	%eax, (%edi,%ecx,4)
+
+L(normal_done):
+	popl	%esi
+	popl	%edi
+
+	movl	$0, %eax
+	popl	%ebx
+
+	setc	%al
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+	C dst==src2
+
+	movl	%ebx, %edx
+')
+
+L(inplace):
+	C eax	initial carry
+	C ebx
+	C ecx	size
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	leal	-1(%ecx), %esi
+	decl	%ecx
+
+	andl	$-4, %ecx
+	andl	$3, %esi
+
+	movl	(%edx), %ebx		C src low limb
+	leal	(%edx,%ecx,4), %edx
+
+	leal	(%edi,%ecx,4), %edi
+	negl	%ecx
+
+	shrl	%eax
+
+
+	ALIGN(32)
+L(inplace_top):
+	C eax
+	C ebx	next src limb
+	C ecx	size
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	M4_inst	%ebx, (%edi,%ecx,4)
+
+	movl	4(%edx,%ecx,4), %eax
+	leal	5(%ecx), %ecx
+
+	M4_inst	%eax, 4-20(%edi,%ecx,4)
+
+	movl	8-20(%edx,%ecx,4), %eax
+	movl	12-20(%edx,%ecx,4), %ebx
+
+	M4_inst	%eax, 8-20(%edi,%ecx,4)
+	M4_inst	%ebx, 12-20(%edi,%ecx,4)
+
+	movl	16-20(%edx,%ecx,4), %ebx
+	loop	L(inplace_top)
+
+
+	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+	M4_inst	%ebx, (%edi)
+
+	decl	%esi
+	jz	L(inplace_finish_one)
+	js	L(inplace_done)
+
+	C two or three more limbs
+
+	movl	4(%edx), %eax
+	movl	8(%edx), %ebx
+	M4_inst	%eax, 4(%edi)
+	M4_inst	%ebx, 8(%edi)
+
+	decl	%esi
+	movl	$2, %ecx
+
+	jz	L(normal_done)
+
+L(inplace_finish_one):
+	movl	4(%edx,%ecx,4), %eax
+	M4_inst	%eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+	popl	%esi
+	popl	%edi
+
+	movl	$0, %eax
+	popl	%ebx
+
+	setc	%al
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/aorsmul_1.asm b/third_party/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000..eaa92eb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,391 @@
+dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 5.94
+C P6 model 9  (Banias)		 5.51
+C P6 model 13 (Dothan)		 5.57
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			7.65-8.5 (data dependent)
+C AMD K7
+C AMD K8
+
+
+dnl  K6:           large multipliers  small multipliers
+dnl  UNROLL_COUNT    cycles/limb       cycles/limb
+dnl        4             9.5              7.78
+dnl        8             9.0              7.78
+dnl       16             8.4              7.65
+dnl       32             8.4              8.2
+dnl
+dnl  Maximum possible unrolling with the current code is 32.
+dnl
+dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl  byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+	define(M4_inst,        addl)
+	define(M4_function_1,  mpn_addmul_1)
+	define(M4_function_1c, mpn_addmul_1c)
+',`ifdef(`OPERATION_submul_1', `
+	define(M4_inst,        subl)
+	define(M4_function_1,  mpn_submul_1)
+	define(M4_function_1c, mpn_submul_1c)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+C The jadcl0()s in the unrolled loop make the speed data dependent.  Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data.  Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help.  It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C Attempts at a simpler main loop, with less unrolling, haven't yielded much
+C success, generally running over 9 c/l.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C                                         decode cycles
+C		mull	%ebp                    2
+C		M4_inst	%esi, disp(%edi)        1
+C		adcl	%eax, %ecx              2
+C		movl	%edx, %esi            \ 1
+C		jnc	1f                    /
+C		incl	%esi                  \ 1
+C	1:	movl	disp(%ebx), %eax      /
+C                                              ---
+C                                               7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted).  This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Don't really know why.
+C
+C It's not clear how much branch misprediction might be costing.  The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C	x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
+C	--- * sum sum --- = --- * ------- * ------- = 1/4
+C       b^2   x=0 y=1 b^2   b^4      2         2
+C
+C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
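The 1/4 figure derived above can be checked by brute force with a toy limb size. The following Python sketch is purely illustrative (not part of GMP): for base b the exact carry fraction works out to ((b-1)/(2b))^2, a shade under 1/4, exactly as the summation says.

```python
def carry_fraction(b):
    # Count quadruples (x, y, u, v) in [0, b)^4 with
    # x*y + u*b + v >= b*b, i.e. a carry out of the top limb.
    carries = 0
    for x in range(b):
        for y in range(b):
            for uv in range(b * b):      # uv enumerates u*b + v
                if x * y + uv >= b * b:
                    carries += 1
    return carries / b ** 4

# Exact closed form from the summation above: ((b-1)/(2*b))**2
print(carry_fraction(8))   # 0.19140625 == (7/16)**2
```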
+
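For reference, the addmul_1 operation the prototypes above describe (dst += src*mult, returning the high carry limb) can be modeled in a few lines of Python. Illustrative only; the name and limb conventions are taken from the prototypes, not from GMP's C code.

```python
B = 2 ** 32  # limb size

def mpn_addmul_1(dst, src, mult, carry=0):
    # dst[i] += src[i] * mult with carry propagation.  Mid-loop the
    # carry can occupy up to a full extra limb, which is what the
    # assembly juggles between ecx and esi.  Returns (new_dst, carry).
    out = []
    for d, s in zip(dst, src):
        t = d + s * mult + carry
        out.append(t % B)
        carry = t // B
    return out, carry
```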
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+	pushl	%esi
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %esi
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+	push	%esi
+deflit(`FRAME',4)
+	xorl	%esi, %esi	C initial carry
+
+L(start_nc):
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebx
+deflit(`FRAME',8)
+
+	movl	PARAM_SRC, %ebx
+	pushl	%edi
+deflit(`FRAME',12)
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_DST, %edi
+
+	pushl	%ebp
+deflit(`FRAME',16)
+	jae	L(unroll)
+
+
+	C simple loop
+
+	movl	PARAM_MULTIPLIER, %ebp
+
+L(simple):
+	C eax	scratch
+	C ebx	src
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst
+	C ebp	multiplier
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	$4, %edi
+	addl	%esi, %eax
+
+	adcl	$0, %edx
+
+	M4_inst	%eax, -4(%edi)
+
+	adcl	$0, %edx
+
+	movl	%edx, %esi
+	loop	L(simple)
+
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%ebx
+	movl	%esi, %eax
+
+	popl	%esi
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl  overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP,    `PARAM_DST')
+
+L(unroll):
+	C eax
+	C ebx	src
+	C ecx	size
+	C edx
+	C esi	initial carry
+	C edi	dst
+	C ebp
+
+	movl	%ecx, %edx
+	decl	%ecx
+
+	subl	$2, %edx
+	negl	%ecx
+
+	shrl	$UNROLL_LOG2, %edx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%edx, VAR_COUNTER
+	movl	%ecx, %edx
+
+	shll	$4, %edx
+	negl	%ecx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%edx,%ecx,1), %edx
+')
+	movl	(%ebx), %eax		C src low limb
+
+	movl	PARAM_MULTIPLIER, %ebp
+	movl	%edx, VAR_JUMP
+
+	mull	%ebp
+
+	addl	%esi, %eax	C initial carry (from _1c)
+	jadcl0(	%edx)
+
+
+	leal	4(%ebx,%ecx,4), %ebx
+	movl	%edx, %esi	C high carry
+
+	movl	VAR_JUMP, %edx
+	leal	(%edi,%ecx,4), %edi
+
+	testl	$1, %ecx
+	movl	%eax, %ecx	C low carry
+
+	jz	L(noswap)
+	movl	%esi, %ecx	C high,low carry other way around
+
+	movl	%eax, %esi
+L(noswap):
+
+	jmp	*%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%edx,%ecx,1), %edx
+	addl	$L(entry)-L(here), %edx
+	addl	(%esp), %edx
+	ret_internal
+')
+
+
+C -----------------------------------------------------------
+	ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+	C eax	scratch
+	C ebx	src
+	C ecx	carry lo
+	C edx	scratch
+	C esi	carry hi
+	C edi	dst
+	C ebp	multiplier
+	C
+	C 15 code bytes per limb
+
+	leal	UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+	deflit(`disp0', eval(2*i*4))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	M4_inst,%ecx, disp0,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0(	%ecx)
+
+	movl	disp1(%ebx), %eax
+	mull	%ebp
+	M4_inst	%esi, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0(	%esi)
+')
+
+	decl	VAR_COUNTER
+
+	leal	UNROLL_BYTES(%ebx), %ebx
+	jns	L(top)
+
+
+	popl	%ebp
+	M4_inst	%ecx, UNROLL_BYTES(%edi)
+
+	popl	%edi
+	movl	%esi, %eax
+
+	popl	%ebx
+	jadcl0(	%eax)
+
+	popl	%esi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/cross.pl b/third_party/gmp/mpn/x86/k6/cross.pl
new file mode 100755
index 0000000..fc921a5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,182 @@
+#! /usr/bin/perl
+
+# Copyright 2000, 2001 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# Code alignments of 8 bytes or more are handled.  When 32 is used, cache
+# line boundaries will fall in at offsets 0x20,0x40,etc and problems are
+# flagged at those locations.  When 16 is used, the line boundaries can also
+# fall at offsets 0x10,0x30,0x50,etc, depending where the file is loaded, so
+# problems are identified there too.  Likewise when 8 byte alignment is used
+# problems are flagged additionally at 0x08,0x18,0x28,etc.
+#
+# Usually 32 byte alignment is used for k6 routines, but less is certainly
+# possible if through good luck, or a little tweaking, cache line crossing
+# problems can be avoided at the extra locations.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions.  All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that's easy to avoid when coding.
+#
+# Future:
+#
+# Warn about jump targets that are poorly aligned (less than 2 instructions
+# before a cache line boundary).
+
+use strict;
+
+sub disassemble {
+    my ($file) = @_;
+    my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+    my $align;
+
+    open (IN, "objdump -Srfh $file |")
+	|| die "Cannot open pipe from objdump\n";
+    while (<IN>) {
+	print;
+
+	if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+	    $align = 1 << $1;
+	    if ($align < 8) {
+		print "ZZ cross.pl cannot handle alignment < 2**3\n";
+		$align = 8
+	    }
+	}
+
+	if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+	} elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+	} elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+	} else {
+	    next;
+	}
+
+	if ($b1 =~ /0f/) {
+	    $prefix = $b1;
+	    $opcode = $b2;
+	    $modrm = $b3;
+	} else {
+	    $prefix = '';
+	    $opcode = $b1;
+	    $modrm = $b2;
+	}
+
+	# modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+	# for K6 and pre-CXT K6-2
+	if ($prefix =~ /0f/
+	    && $opcode !~ /^8/         # jcond disp32
+	    && $modrm =~ /^[0-3][4c]/) {
+	    print "ZZ ($file) >3 bytes to determine instruction length [K6]\n";
+	}
+
+	# with just an opcode, starting 1f mod 20h
+	if (($align==32 && $addr =~ /[13579bdf]f$/
+	     || $align==16 && $addr =~ /f$/
+	     || $align==8 && $addr =~ /[7f]$/)
+	    && $prefix !~ /0f/
+	    && $opcode !~ /1[012345]/ # adc
+	    && $opcode !~ /1[89abcd]/ # sbb
+	    && $opcode !~ /^4/        # inc/dec reg
+	    && $opcode !~ /^5/        # push/pop reg
+	    && $opcode !~ /68/        # push $imm32
+	    && $opcode !~ /^7/        # jcond disp8
+	    && $opcode !~ /a[89]/     # test+imm
+	    && $opcode !~ /a[a-f]/    # stos/lods/scas
+	    && $opcode !~ /b8/        # movl $imm32,%eax
+	    && $opcode !~ /d[0123]/   # rcl
+	    && $opcode !~ /e[0123]/   # loop/loopz/loopnz/jcxz
+	    && $opcode !~ /e8/        # call disp32
+	    && $opcode !~ /e[9b]/     # jmp disp32/disp8
+	    && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+	    && !($opcode =~ /f[67]/          # grp 1
+		 && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+	    && $modrm !~ /^$/) {
+	    print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+	}
+
+	# with an 0F prefix, anything starting at 1f mod 20h
+	if (($align==32 && $addr =~ /[13579bdf][f]$/
+	     || $align==16 && $addr =~ /f$/
+	     || $align==8 && $addr =~ /[7f]$/)
+	    && $prefix =~ /0f/
+	    && $opcode !~ /af/        # imul
+	    && $opcode !~ /a[45]/     # shldl
+	    && $opcode !~ /a[cd]/     # shrdl
+	    ) {
+	    print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+	}
+
+	# with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+	if (($align==32 && $addr =~ /[13579bdf][e]$/
+	     || $align==16 && $addr =~ /[e]$/
+	     || $align==8 && $addr =~ /[6e]$/)
+	    && $prefix =~ /0f/
+	     && $opcode !~ /^8/        # jcond disp32
+	     && $opcode !~ /af/        # imull reg,reg
+	     && $opcode !~ /a[45]/     # shldl
+	     && $opcode !~ /a[cd]/     # shrdl
+	    && $modrm !~ /^$/) {
+	    print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+	}
+    }
+    close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+    @files = @ARGV;
+} else {
+    @files = glob "*.asm";
+    map {s/\.asm$/.o/} @files;
+}
+
+foreach (@files)  {
+    disassemble($_);
+}
diff --git a/third_party/gmp/mpn/x86/k6/divrem_1.asm b/third_party/gmp/mpn/x86/k6/divrem_1.asm
new file mode 100644
index 0000000..b4cea4f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/divrem_1.asm
@@ -0,0 +1,203 @@
+dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 20 cycles/limb
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                          mp_limb_t carry);
+C
+C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
+C instead of decl+jnz, since it comes out 2 cycles/limb faster.
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done.  A div is 20 cycles, so assuming high<divisor about
+C half the time, then this test saves half that amount.  The branch
+C misprediction penalty is less than that.
+C
+C Back-to-back div instructions run at 20 cycles, the same as the loop here,
+C so it seems there's nothing to gain by rearranging the loop.  Pairing the
+C mov and loop instructions was found to gain nothing.
+C
+C Enhancements:
+C
+C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
+C that algorithm has been found to suffer from the relatively poor carry
+C handling on K6 and too many auxiliary instructions.  The fractional part
+C however could be done at about 13 c/l, if it mattered enough.
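The division scheme described above (one divl per limb, integer part from the high limb down, then xsize fraction limbs) can be sketched in Python. This is a reference model assuming the prototype's conventions, not how GMP implements its C fallback.

```python
B = 2 ** 32  # limb size

def mpn_divrem_1(xsize, src, divisor, carry=0):
    # Divide src (least significant limb first) by a one-limb divisor,
    # appending xsize extra "fraction" limbs below the integer part.
    # Each loop step models one hardware divl: (rem, limb) / divisor.
    assert 0 <= carry < divisor
    rem = carry
    q = []
    for limb in reversed(src):      # integer part, high limb first
        qlimb, rem = divmod(rem * B + limb, divisor)
        q.append(qlimb)
    for _ in range(xsize):          # fraction limbs: implicit 0 limbs
        qlimb, rem = divmod(rem * B, divisor)
        q.append(qlimb)
    q.reverse()                     # store least significant first
    return q, rem
```

When the high limb of src is already below the divisor and there is no carry in, the first divmod yields a zero quotient limb, which is the case the assembly short-circuits to skip one div.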
+
+defframe(PARAM_CARRY,  24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	orl	%ecx, %ecx		C size
+
+	movl	PARAM_CARRY, %edx
+	jz	L(fraction)		C if size==0
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jmp	L(integer_top)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	orl	%ecx,%ecx		C size
+
+	jz	L(size_zero)
+	pushl	%ebx		FRAME_pushl()
+
+	movl	-4(%edi,%ecx,4), %eax	C src high limb
+	xorl	%edx, %edx
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	cmpl	%esi, %eax
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jae	L(integer_entry)
+
+
+	C high<divisor, so high of dst is zero, and avoid one div
+
+	movl	%edx, (%ebx,%ecx,4)
+	decl	%ecx
+
+	movl	%eax, %edx
+	jz	L(fraction)
+
+
+L(integer_top):
+	C eax	scratch (quotient)
+	C ebx	dst+4*xsize-4
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi	src
+	C ebp	xsize
+
+	movl	-4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+	divl	%esi
+
+	movl	%eax, (%ebx,%ecx,4)
+	loop	L(integer_top)
+
+
+L(fraction):
+	orl	%ebp, %ecx
+	jz	L(done)
+
+	movl	PARAM_DST, %ebx
+
+
+L(fraction_top):
+	C eax	scratch (quotient)
+	C ebx	dst
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi
+	C ebp
+
+	xorl	%eax, %eax
+
+	divl	%esi
+
+	movl	%eax, -4(%ebx,%ecx,4)
+	loop	L(fraction_top)
+
+
+L(done):
+	popl	%ebp
+	movl	%edx, %eax
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+	movl	PARAM_XSIZE, %ecx
+	xorl	%eax, %eax
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	stosl
+
+	popl	%esi
+	popl	%edi
+	ret
+EPILOGUE()
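+
+C For reference, the integer and fraction loops above amount to the
+C following C sketch (an illustration only, not part of GMP; 32-bit
+C limbs assumed, d nonzero, each 64/32 division standing in for divl):

```c
#include <stdint.h>
#include <stddef.h>

/* Sketch of mpn_divrem_1 semantics: divide src[0..size-1] by d,
   writing the integer-part quotient to dst[xsize..xsize+size-1] and
   xsize extra fraction limbs to dst[0..xsize-1].  Returns the final
   remainder, matching the value left in %edx by the loops above. */
static uint32_t divrem_1_ref(uint32_t *dst, size_t xsize,
                             const uint32_t *src, size_t size, uint32_t d)
{
    uint32_t r = 0;
    for (size_t i = size; i-- > 0; ) {        /* integer part, high to low */
        uint64_t n = ((uint64_t)r << 32) | src[i];
        dst[xsize + i] = (uint32_t)(n / d);   /* one divl per limb */
        r = (uint32_t)(n % d);
    }
    for (size_t i = xsize; i-- > 0; ) {       /* fraction: low limb is 0 */
        uint64_t n = (uint64_t)r << 32;
        dst[i] = (uint32_t)(n / d);
        r = (uint32_t)(n % d);
    }
    return r;
}
```

+C E.g. dividing the single limb 1 by 2 with xsize==1 yields integer
+C limb 0 and fraction limb 0x80000000 (i.e. one half).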
diff --git a/third_party/gmp/mpn/x86/k6/gmp-mparam.h b/third_party/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000..f03f1b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,166 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2004, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 450MHz K6-2 */
+
+#define MOD_1_NORM_THRESHOLD                12
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         41
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         32
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         3
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD    128
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                69
+#define MUL_TOOM44_THRESHOLD               106
+#define MUL_TOOM6H_THRESHOLD               157
+#define MUL_TOOM8H_THRESHOLD               199
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      64
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                143
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                272
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     23, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287,11}, {    159,10}, {    351,11}, {    191,10}, \
+    {    415, 9}, {    831,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    575,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    703,13}, {    383,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             432  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    432, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     24, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 7}, {     93, 8}, {     47, 7}, \
+    {     95, 8}, {     51,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    703,12}, {    383,11}, \
+    {    831,12}, {    447,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    703,13}, \
+    {    383,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 112
+#define SQR_FFT_THRESHOLD                 7040
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  60
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 78
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                84
+#define DC_BDIV_Q_THRESHOLD                171
+
+#define INV_MULMOD_BNM1_THRESHOLD           55
+#define INV_NEWTON_THRESHOLD               234
+#define INV_APPR_THRESHOLD                 236
+
+#define BINV_NEWTON_THRESHOLD              268
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD              134
+#define MU_BDIV_QR_THRESHOLD              1164
+#define MU_BDIV_Q_THRESHOLD               1164
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     182
+#define GCD_DC_THRESHOLD                   591
+#define GCDEXT_DC_THRESHOLD                472
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                24
+#define GET_STR_PRECOMPUTE_THRESHOLD        40
+#define SET_STR_DC_THRESHOLD               834
+#define SET_STR_PRECOMPUTE_THRESHOLD      2042
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm b/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000..f80a5a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,118 @@
+dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.0 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
+C
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned.  This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved.  Considering how few
+C uses copyd finds perhaps that should be favoured.  The current code has
+C the attraction of being no slower than a basic rep movsl though.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	subl	$1, %ecx		C better code alignment than decl
+	jb	L(zero)
+
+	jz	L(one_more)
+	leal	4(%edx,%ecx,4), %ebx
+
+Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
+Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment
+
+	cmpl	$1, %ecx
+	je	L(one_more)
+
+	shrl	$2, %ebx
+	andl	$1, %ebx		C 1 if dst[size-2] unaligned
+
+	subl	%ebx, %ecx
+	nop				C code alignment
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
+
+	movq	-4(%eax,%ecx,4), %mm0
+	subl	$2, %ecx
+
+	movq	%mm0, 4(%edx,%ecx,4)
+	ja	L(top)
+
+
+L(one_more):
+	movd	(%eax), %mm0
+	movd	%mm0, (%edx)
+
+	movl	SAVE_EBX, %ebx
+	emms_or_femms
+L(zero):
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm b/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000..c86575f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,294 @@
+dnl  AMD K6-2 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl  minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shldl(	%cl, %edx, %eax)	C return value
+
+	shll	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx,%eax,4), %edx	C src high limb
+	negl	%ecx
+
+	movd	PARAM_SHIFT, %mm6
+	addl	$32, %ecx		C 32-shift
+
+	shrl	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	movl	%edx, VAR_RETVAL
+	jae	L(unroll)
+
+
+	movd	%ecx, %mm7
+	movl	%eax, %ecx
+
+	movl	PARAM_DST, %eax
+
+L(simple):
+	C eax	dst
+	C ebx	src
+	C ecx	counter, size-1 to 1
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%ecx,4), %mm0
+
+	psrlq	%mm7, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
+	loop	L(simple)
+
+
+	movd	(%ebx), %mm0
+	popl	%ebx
+
+	psllq	%mm6, %mm0
+
+	movd	%mm0, (%eax)
+	movl	%edx, %eax
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval (but instead VAR_RETVAL is used)
+	C
+	C mm6	shift
+
+	addl	$32, %ecx
+	movl	PARAM_DST, %edx
+
+	movd	%ecx, %mm7
+	subl	$7, %eax			C size-8
+
+	leal	(%edx,%eax,4), %ecx		C alignment of dst
+
+	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
+	testb	$4, %cl
+
+	jz	L(dst_aligned)
+	psllq	%mm6, %mm2
+
+	psrlq	$32, %mm2
+	decl	%eax
+
+	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
+	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
+L(dst_aligned):
+
+	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written.  This schedule seems necessary for
+	C full speed.
+	C
+	C The use of size-8 lets the loop stop when %eax goes negative and
+	C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, size-8 step by -4 until <0
+	C ebx	src
+	C ecx
+	C edx	dst
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psllq	%mm6, %mm2
+	subl	$4, %eax
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	24(%ebx,%eax,4), %mm0
+
+	psllq	%mm6, %mm1
+	movq	%mm2, 40(%edx,%eax,4)
+
+	movq	%mm0, %mm2
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm1, 32(%edx,%eax,4)
+	jnc	L(top)
+
+
+	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+	C
+	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+	C %eax is between -4 and -1, representing respectively 0 to 3 extra
+	C limbs that must be read.
+
+
+	testl	$2, %eax	C testl to avoid bad cache line crossing
+	jz	L(finish_nottwo)
+
+	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+	C new mm2 and a new mm0 is loaded.
+
+	psllq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psrlq	%mm7, %mm0
+	subl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm2, 32(%edx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+	testb	$1, %al
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psllq	%mm6, %mm1
+
+	movq	%mm2, 24(%edx,%eax,4)
+	jz	L(finish_even)
+
+
+	C Size is odd, so mm1 and one extra limb to process.
+
+	movd	(%ebx), %mm0		C src[0]
+	popl	%ebx
+deflit(`FRAME',0)
+
+	movq	%mm0, %mm2
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	psllq	%mm6, %mm2
+	por	%mm0, %mm1
+
+	movq	%mm1, 4(%edx)		C dst[1,2]
+	movd	%mm2, (%edx)		C dst[0]
+
+	movl	VAR_RETVAL, %eax
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+	C Size is even, so only mm1 left to process.
+
+	movq	%mm1, (%edx)		C dst[0,1]
+	movl	VAR_RETVAL, %eax
+
+	popl	%ebx
+	femms
+	ret
+
+EPILOGUE()
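+
+C The shldl single-limb case and the MMX loops above both implement the
+C usual two-limb shift recurrence, sketched in C below (illustration
+C only, not part of GMP; shift must be 1..31 as mpn_lshift requires):

```c
#include <stdint.h>
#include <stddef.h>

/* Sketch of mpn_lshift semantics: shift src[0..size-1] left by
   1 <= shift <= 31 bits into dst, returning the bits shifted out of
   the most significant limb (the value the asm keeps in VAR_RETVAL). */
static uint32_t lshift_ref(uint32_t *dst, const uint32_t *src,
                           size_t size, unsigned shift)
{
    uint32_t retval = src[size - 1] >> (32 - shift);
    for (size_t i = size - 1; i > 0; i--)     /* high to low, like the asm */
        dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
    dst[0] = src[0] << shift;
    return retval;
}
```

+C The psllq/psrlq pair in the unrolled loop computes the same
+C (src[i]<<shift) | (src[i-1]>>(32-shift)) combination two limbs at a
+C time, which is where the 1.75 c/l comes from.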
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm b/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000..f604a7b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,293 @@
+dnl  AMD K6-2 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shrdl(	%cl, %edx, %eax)	C return value
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx), %edx	C src low limb
+	negl	%ecx
+
+	addl	$32, %ecx
+	movd	PARAM_SHIFT, %mm6
+
+	shll	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+
+
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	movl	PARAM_DST, %ecx
+	leal	(%ebx,%eax,4), %ebx
+
+	leal	-4(%ecx,%eax,4), %ecx
+	negl	%eax
+
+	C This loop runs at about 3 cycles/limb, which is the amount of
+	C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+	C eax	counter, -(size-1) to -1
+	C ebx	&src[size-1]
+	C ecx	&dst[size-1]
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+
+Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
+	jnz	L(simple)
+
+
+	movq	%mm0, (%ecx)
+	movl	%edx, %eax
+
+	popl	%ebx
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	addl	$32, %ecx
+	subl	$7, %eax		C size-8
+
+	movd	%ecx, %mm7
+	movl	PARAM_DST, %ecx
+
+	movq	(%ebx), %mm2		C src low qword
+	leal	(%ebx,%eax,4), %ebx	C src end - 32
+
+	testb	$4, %cl
+	leal	(%ecx,%eax,4), %ecx	C dst end - 32
+
+	notl	%eax			C -(size-7)
+	jz	L(dst_aligned)
+
+	psrlq	%mm6, %mm2
+	incl	%eax
+
+Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
+	movq	4(%ebx,%eax,4), %mm2	C new src low qword
+L(dst_aligned):
+
+	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
+	nop	C avoid bad cache line crossing
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written.  This schedule seems necessary for
+	C full speed.
+	C
+	C The use of -(size-7) lets the loop stop when %eax becomes >= 0,
+	C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, -(size-7) step by +4 until >=0
+	C ebx	src end - 32
+	C ecx	dst end - 32
+	C edx	retval
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psrlq	%mm6, %mm2
+	addl	$4, %eax
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	4(%ebx,%eax,4), %mm0
+
+	psrlq	%mm6, %mm1
+	movq	%mm2, -12(%ecx,%eax,4)
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm1, -4(%ecx,%eax,4)
+	ja	L(top)		C jump if no carry and not zero
+
+
+
+	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+	C to 3 representing respectively 3 to 0 further limbs.
+
+	testl	$2, %eax	C testl to avoid bad cache line crossings
+	jnz	L(finish_nottwo)
+
+	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+	C becomes new mm2 and a new mm0 is loaded.
+
+	psrlq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psllq	%mm7, %mm0
+	addl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm2, -4(%ecx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	testb	$1, %al
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psrlq	%mm6, %mm1
+
+	movq	%mm2, 4(%ecx,%eax,4)
+	jnz	L(finish_even)
+
+
+	C one further extra limb to process
+
+	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
+	popl	%ebx
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	psrlq	%mm6, %mm2
+
+	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
+	movd	%mm2, 32-4(%ecx)	C dst[size-1]
+
+	movl	%edx, %eax		C retval
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+	C no further extra limbs
+
+	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
+	movl	%edx, %eax		C retval
+
+	popl	%ebx
+
+	femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/com.asm b/third_party/gmp/mpn/x86/k6/mmx/com.asm
new file mode 100644
index 0000000..b747454
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/com.asm
@@ -0,0 +1,103 @@
+dnl  AMD K6-2 mpn_com -- mpn bitwise one's complement.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K6-2  1.0   1.18  1.18  1.18  cycles/limb
+C K6    1.5   1.85  1.75  1.85
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	shrl	%ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+	notl_or_xorl_GMP_NUMB_MASK(	%eax)
+	movl	%eax, (%edx)
+	ret
+
+
+L(two_or_more):
+	pushl	%ebx	FRAME_pushl()
+	pcmpeqd	%mm7, %mm7		C all ones
+
+	movl	%ecx, %ebx
+ifelse(GMP_NAIL_BITS,0,,
+`	psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
+
+
+
+	ALIGN(8)
+L(top):
+	C eax	src
+	C ebx	floor(size/2)
+	C ecx	counter
+	C edx	dst
+	C
+	C mm0	scratch
+	C mm7	mask
+
+	movq	-8(%eax,%ecx,8), %mm0
+	pxor	%mm7, %mm0
+	movq	%mm0, -8(%edx,%ecx,8)
+	loop	L(top)
+
+
+	jnc	L(no_extra)
+	movl	(%eax,%ebx,8), %eax
+	notl_or_xorl_GMP_NUMB_MASK(	%eax)
+	movl	%eax, (%edx,%ebx,8)
+L(no_extra):
+
+	popl	%ebx
+	emms_or_femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm b/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 0000000..1bbad3a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,282 @@
+dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C K6:   10.0  12.0  cycles/limb
+C K6-2: 10.0  11.5
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C A simple divl is used for size==1.  This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	xorl	%edx, %edx
+
+	cmpl	$1, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	PARAM_DST, %ecx
+	movl	%eax, (%ecx)
+
+	ret
+
+
+L(two_or_more):
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+L(strip_twos):
+	shrl	%eax
+	incl	%edx			C will get shift+1
+
+	jnc	L(strip_twos)
+	pushl	%esi		FRAME_pushl()
+
+	leal	1(%eax,%eax), %esi	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+	pushl	%edi		FRAME_pushl()
+
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_DST, %edi
+
+	imull	%esi, %eax		C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%esi, PARAM_DIVISOR	C d without twos
+	leal	(%ebx,%ecx,4), %ebx	C src end
+
+	imull	%esi, %ebp		C inv*inv*d
+
+	leal	(%edi,%ecx,4), %edi	C dst end
+	negl	%ecx			C -size
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	subl	$1, %edx		C shift amount, and clear carry
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	jnz	L(even)
+
+	movl	(%ebx,%ecx,4), %esi	C src low limb
+	jmp	L(odd_entry)
+
+
+	ALIGN(16)
+	nop	C code alignment
+L(odd_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	imull	%edx, %esi
+
+	movl	PARAM_DIVISOR, %eax
+	movl	%esi, -4(%edi,%ecx,4)
+
+	mull	%esi			C carry limb in edx
+
+	subl	%ebp, %edx		C apply carry bit
+	movl	(%ebx,%ecx,4), %esi
+
+L(odd_entry):
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(odd_top)
+
+
+	imull	%edx, %esi
+
+	movl	%esi, -4(%edi,%ecx,4)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+L(even):
+	C eax
+	C ebx	src end
+	C ecx	-size
+	C edx	twos
+	C esi
+	C edi	dst end
+	C ebp
+
+	xorl	%ebp, %ebp
+Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]
+
+	movd	%edx, %mm7
+	movl	VAR_INVERSE, %edx
+
+	addl	$2, %ecx
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	jz	L(even_two)		C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
+C been found.  Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+	C
+	C mm0	scratch, source limbs
+	C mm7	twos
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi,%ecx,4)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movq	-4(%ebx,%ecx,4), %mm0
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(even_top)
+
+
+L(even_two):
+	movd	-4(%ebx), %mm0		C src high limb
+	psrlq	%mm7, %mm0
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	movl	VAR_INVERSE, %eax
+	subl	%edx, %esi		C apply carry limb
+
+	imull	%eax, %esi
+
+	movl	%esi, -4(%edi)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	emms_or_femms
+
+	ret
+
+EPILOGUE()
+ASM_END()
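The `inv = 2*inv - inv*inv*d` steps in the code above are Newton iterations on the inverse of d modulo 2^GMP_LIMB_BITS, each doubling the number of correct low bits. A minimal C sketch under the assumption of 32-bit limbs (GMP itself seeds the iteration from the 8-bit `binvert_limb_table` lookup rather than from `d`, as the mode1o.asm file further below does):

```c
#include <assert.h>
#include <stdint.h>

/* Newton iteration for the inverse of an odd d modulo 2^32, mirroring the
   "inv = 2*inv - inv*inv*d" steps in the assembly.  Sketch only: GMP seeds
   from an 8-bit table lookup instead of from d itself. */
static uint32_t invert_limb_mod32(uint32_t d)
{
    assert(d & 1);                   /* only odd d are invertible mod 2^32 */
    uint32_t inv = d;                /* odd squares are 1 mod 8: 3 bits correct */
    inv = 2 * inv - inv * inv * d;   /*  6 bits */
    inv = 2 * inv - inv * inv * d;   /* 12 bits */
    inv = 2 * inv - inv * inv * d;   /* 24 bits */
    inv = 2 * inv - inv * inv * d;   /* 48 >= 32 bits */
    return inv;
}
```

Each step maps an inverse correct mod 2^k to one correct mod 2^2k, which is what the interleaved `imull`/`leal`/`subl` sequences above compute while hiding the multiply latency.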
diff --git a/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm b/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000..e17930b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,226 @@
+dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C         alignment dst/src1/src2, A=0mod8, N=4mod8
+C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
+C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
+C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
+C
+C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
+C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
+C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
+
+
+dnl  M4_p and M4_i are the MMX and integer instructions
+dnl  M4_*_neg_dst means whether to negate the final result before writing
+dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function',  `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p',         `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i',         `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl  style (the two are equivalent for xor).
+dnl
+dnl  pandn can't be used with nails.
+
+M4_choose_op( and_n,  pand,0,0,  andl,0,0)
+ifelse(GMP_NAIL_BITS,0,
+`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
+`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
+M4_choose_op( nand_n, pand,1,0,  andl,1,0)
+M4_choose_op( ior_n,  por,0,0,   orl,0,0)
+M4_choose_op( iorn_n, por,0,1,   orl,0,1)
+M4_choose_op( nior_n, por,1,0,   orl,1,0)
+M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                   mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones.  The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C           alignment dst/src1/src2, A=0mod8, N=4mod8
+C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
+C
+C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
+C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
+C K6                 2.0    2.25                2.35   2.28   nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(M4_function)
+			movl	PARAM_SIZE, %ecx
+			pushl	%ebx		FRAME_pushl()
+
+			movl	PARAM_SRC1, %eax
+
+			movl	PARAM_SRC2, %ebx
+			cmpl	$1, %ecx
+
+			movl	PARAM_DST, %edx
+			ja	L(two_or_more)
+
+
+			movl	(%ebx), %ecx
+			popl	%ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			movl	%ecx, (%edx)
+
+			ret
+
+
+L(two_or_more):
+			C eax	src1
+			C ebx	src2
+			C ecx	size
+			C edx	dst
+			C esi
+			C edi
+			C ebp
+
+			pushl	%esi		FRAME_pushl()
+			testl	$4, %eax
+			jz	L(alignment_ok)
+
+			movl	(%ebx), %esi
+			addl	$4, %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
+			M4_i	(%eax), %esi
+			addl	$4, %eax
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
+			movl	%esi, (%edx)
+			addl	$4, %edx
+			decl	%ecx
+
+L(alignment_ok):
+			movl	%ecx, %esi
+			shrl	%ecx
+			jnz	L(still_two_or_more)
+
+			movl	(%ebx), %ecx
+			popl	%esi
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			popl	%ebx
+			movl	%ecx, (%edx)
+			ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+			pcmpeqd	%mm7, %mm7		C all ones
+ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
+')
+
+			ALIGN(16)
+L(top):
+			C eax	src1
+			C ebx	src2
+			C ecx	counter
+			C edx	dst
+			C esi
+			C edi
+			C ebp
+			C
+			C carry bit is low of size
+
+			movq	-8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
+			M4_p	-8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
+			movq	%mm0, -8(%edx,%ecx,8)
+
+			loop	L(top)
+
+
+			jnc	L(no_extra)
+
+			movl	-4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
+			M4_i	-4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
+			movl	%ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+			popl	%esi
+			popl	%ebx
+			emms_or_femms
+			ret
+
+EPILOGUE()
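The `M4_p_neg_dst`/`M4_p_neg_src2` flag scheme above reduces all eight entry points to one base instruction plus optional complements. A hedged C model of that selection (the function and parameter names here are illustrative, not GMP's):

```c
#include <stddef.h>
#include <stdint.h>

/* One base op (and/ior/xor) plus "negate src2" and "negate result" flags
   covers all eight operations, as in the M4_choose_op table above. */
static void logop_n(uint32_t *dst, const uint32_t *src1, const uint32_t *src2,
                    size_t n, char base, int neg_src2, int neg_dst)
{
    for (size_t i = 0; i < n; i++) {
        uint32_t b = neg_src2 ? ~src2[i] : src2[i];
        uint32_t r = (base == '&') ? (src1[i] & b)
                   : (base == '|') ? (src1[i] | b)
                   :                 (src1[i] ^ b);
        dst[i] = neg_dst ? ~r : r;
    }
}
```

For example, nand_n is `('&', 0, 1)`, iorn_n is `('|', 1, 0)` and xnor_n is `('^', 1, 0)`, matching the table rows (xnor in "iorn" style, complementing src2 rather than the result).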
diff --git a/third_party/gmp/mpn/x86/k6/mmx/lshift.asm b/third_party/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000..45be582
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,130 @@
+dnl  AMD K6 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions.  This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shldl(	%cl, %edx, %eax)	C return value
+
+	shll	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+	ALIGN(16)	C avoid offset 0x1f
+	nop		C avoid bad cache line crossing
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx,%eax,4), %edx	C src high limb
+	negl	%ecx
+
+	movd	PARAM_SHIFT, %mm6
+	addl	$32, %ecx		C 32-shift
+
+	shrl	%cl, %edx
+
+	movd	%ecx, %mm7
+	movl	PARAM_DST, %ecx
+
+L(top):
+	C eax	counter, size-1 to 1
+	C ebx	src
+	C ecx	dst
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, 4(%ecx,%eax,4)
+	jnz	L(top)
+
+
+	movd	(%ebx), %mm0
+	popl	%ebx
+
+	psllq	%mm6, %mm0
+	movl	%edx, %eax
+
+	movd	%mm0, (%ecx)
+
+	emms
+	ret
+
+EPILOGUE()
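Stripping away the MMX pipelining, mpn_lshift's contract can be sketched in plain C (assuming 32-bit limbs and a shift count in 1..31, which the psllq/psrlq pairing above relies on):

```c
#include <stddef.h>
#include <stdint.h>

/* Plain-C model of mpn_lshift: shift {src,n} left by cnt bits into dst
   and return the bits shifted out of the high limb.  Requires 0 < cnt < 32. */
static uint32_t lshift_n(uint32_t *dst, const uint32_t *src,
                         size_t n, unsigned cnt)
{
    uint32_t ret = src[n - 1] >> (32 - cnt);     /* return value */
    for (size_t i = n - 1; i > 0; i--)           /* high to low, like the asm */
        dst[i] = (src[i] << cnt) | (src[i - 1] >> (32 - cnt));
    dst[0] = src[0] << cnt;
    return ret;
}
```

The 64-bit `movq`/`psrlq` in the loop above computes each `(src[i] << cnt) | (src[i-1] >> (32-cnt))` pair in a single shift, which is how the 3 c/l figure is reached.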
diff --git a/third_party/gmp/mpn/x86/k6/mmx/popham.asm b/third_party/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000..2b19d0b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,236 @@
+dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl  Hamming distance.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C        popcount  hamdist
+C K6-2:    9.0       11.5   cycles/limb
+C K6:      12.5      13.0
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC2,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+
+	RODATA
+	ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+
+L(rodata_000000FF000000FF):
+	.long	0x000000FF
+	.long	0x000000FF
+')
+
+	TEXT
+	ALIGN(32)
+
+POP(`ifdef(`PIC', `
+	C avoid shrl crossing a 32-byte boundary
+	nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %eax
+	movl	$0x33333333, %edx
+
+	movd	%eax, %mm7
+	movd	%edx, %mm6
+
+	movl	$0x0F0F0F0F, %eax
+	movl	$0x000000FF, %edx
+
+	punpckldq %mm7, %mm7
+	punpckldq %mm6, %mm6
+
+	movd	%eax, %mm5
+	movd	%edx, %mm4
+
+	punpckldq %mm5, %mm5
+	punpckldq %mm4, %mm4
+',`
+
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+	movq	L(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+	movl	PARAM_SRC, %eax
+HAM(`	movl	PARAM_SRC2, %edx')
+
+	pxor	%mm2, %mm2	C total
+
+	shrl	%ecx
+	jnc	L(top)
+
+Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
+	pxor	%mm0, %mm1
+')
+
+	incl	%ecx
+	jmp	L(loaded)
+
+
+	ALIGN(16)
+POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, qwords, decrementing
+	C edx	[hamdist] src2
+	C
+	C mm0	(scratch)
+	C mm1	(scratch)
+	C mm2	total (low dword)
+	C mm3
+	C mm4	\
+	C mm5	| special constants
+	C mm6	|
+	C mm7	/
+
+	movq	-8(%eax,%ecx,8), %mm1
+HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+	movq	%mm1, %mm0
+	pand	REG_AAAAAAAAAAAAAAAA, %mm1
+
+	psrlq	$1, %mm1
+HAM(`	nop			C code alignment')
+
+	psubd	%mm1, %mm0	C bit pairs
+HAM(`	nop			C code alignment')
+
+
+	movq	%mm0, %mm1
+	psrlq	$2, %mm0
+
+	pand	REG_3333333333333333, %mm0
+	pand	REG_3333333333333333, %mm1
+
+	paddd	%mm1, %mm0	C nibbles
+
+
+	movq	%mm0, %mm1
+	psrlq	$4, %mm0
+
+	pand	REG_0F0F0F0F0F0F0F0F, %mm0
+	pand	REG_0F0F0F0F0F0F0F0F, %mm1
+
+	paddd	%mm1, %mm0	C bytes
+
+	movq	%mm0, %mm1
+	psrlq	$8, %mm0
+
+
+	paddb	%mm1, %mm0	C words
+
+
+	movq	%mm0, %mm1
+	psrlq	$16, %mm0
+
+	paddd	%mm1, %mm0	C dwords
+
+	pand	REG_000000FF000000FF, %mm0
+
+	paddd	%mm0, %mm2	C low to total
+	psrlq	$32, %mm0
+
+	paddd	%mm0, %mm2	C high to total
+	loop	L(top)
+
+
+
+	movd	%mm2, %eax
+	emms_or_femms
+	ret
+
+EPILOGUE()
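The loop above is the standard SWAR reduction: 1-bit counts folded into 2-bit pairs, then nibbles, bytes, and a final sum. A 64-bit C rendering of one iteration (the asm subtracts the masked-and-shifted high bits, which is the same first step in disguise):

```c
#include <stdint.h>

/* SWAR population count of one 64-bit word, following the same
   pairs -> nibbles -> bytes -> total folding as the MMX loop above. */
static unsigned popcount64(uint64_t x)
{
    x -= (x >> 1) & 0x5555555555555555ull;                    /* bit pairs */
    x = (x & 0x3333333333333333ull)
        + ((x >> 2) & 0x3333333333333333ull);                 /* nibbles   */
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full;               /* bytes     */
    return (unsigned)((x * 0x0101010101010101ull) >> 56);     /* sum bytes */
}
```

As the `pxor` at the top of the loop shows, hamdist(a,b) is just popcount(a^b) accumulated over the limbs.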
diff --git a/third_party/gmp/mpn/x86/k6/mmx/rshift.asm b/third_party/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000..cd0382f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,130 @@
+dnl  AMD K6 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions.  This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shrdl(	%cl, %edx, %eax)	C return value
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx), %edx	C src low limb
+	negl	%ecx
+
+	addl	$32, %ecx	C 32-shift
+	movd	PARAM_SHIFT, %mm6
+
+	shll	%cl, %edx	C retval
+	movl	PARAM_DST, %ecx
+
+	leal	(%ebx,%eax,4), %ebx
+
+	leal	-4(%ecx,%eax,4), %ecx
+	negl	%eax
+
+
+L(simple):
+	C eax	counter (negative)
+	C ebx	&src[size-1]
+	C ecx	&dst[size-1]
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+
+Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
+	jnz	L(simple)
+
+
+	movq	%mm0, (%ecx)
+	movl	%edx, %eax
+
+	popl	%ebx
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm b/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm
new file mode 100644
index 0000000..7e30503
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 2.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C An attempt was made to use a loop like
+C
+C L(top):
+C	adcl	(%edx), %eax
+C	adcl	4(%edx), %ebx
+C	adcl	8(%edx), %esi
+C	leal	12(%edx), %edx
+C	loop	L(top)
+C
+C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
+C The form used instead can save about 6 cycles by not dividing by 3.
+C
+C In the code used, putting the "leal"s at the top of the loop is necessary
+C for the claimed speed, anywhere else costs an extra cycle per loop.
+C Perhaps a tight loop like this needs short decode instructions at the
+C branch target, which would explain the leal/loop form above taking 8
+C cycles instead of 7 too.
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %eax
+	ja	L(three_or_more)
+
+Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
+	jne	L(one)
+
+	movl	%eax, %ecx
+	movl	4(%edx), %edx
+
+	shrl	$24, %eax		C src[0] high
+	andl	$0x00FFFFFF, %ecx	C src[0] low
+
+	addl	%ecx, %eax
+	movl	%edx, %ecx
+
+	shll	$8, %edx
+	andl	$0x00FFFF00, %edx	C src[1] high
+
+	shrl	$16, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax	size-2
+	C ebx
+	C ecx
+	C edx	src
+
+	movl	%ebx, SAVE_EBX
+	xorl	%ebx, %ebx
+
+	movl	%esi, SAVE_ESI
+	pushl	%edi	FRAME_pushl()
+
+	xorl	%esi, %esi
+	xorl	%edi, %edi		C and clear carry flag
+
+L(top):
+	C eax	counter, limbs
+	C ebx	acc 0mod3
+	C ecx
+	C edx	src, incrementing
+	C esi	acc 1mod3
+	C edi	acc 2mod3
+	C ebp
+
+	leal	-2(%eax), %eax
+	leal	12(%edx), %edx
+
+	adcl	-12(%edx), %ebx
+	adcl	-8(%edx), %esi
+	adcl	-4(%edx), %edi
+
+	decl	%eax
+	jg	L(top)
+
+
+	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	movb	$0, %cl
+	incl	%eax
+
+	js	L(combine)		C 0 more
+
+Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings
+
+	movb	$8, %cl
+	decl	%eax
+
+	js	L(combine)		C 1 more
+
+	adcl	4(%edx), %esi
+
+	movb	$16, %cl
+
+
+L(combine):
+	sbbl	%edx, %edx
+
+	shll	%cl, %edx		C carry
+	movl	%ebx, %eax		C 0mod3
+
+	shrl	$24, %eax		C 0mod3 high
+	andl	$0x00FFFFFF, %ebx	C 0mod3 low
+
+	subl	%edx, %eax		C apply carry
+	movl	%esi, %ecx		C 1mod3
+
+	shrl	$16, %esi		C 1mod3 high
+	addl	%ebx, %eax		C apply 0mod3 low
+
+	andl	$0x0000FFFF, %ecx
+	addl	%esi, %eax		C apply 1mod3 high
+
+	shll	$8, %ecx		C 1mod3 low
+	movl	%edi, %edx		C 2mod3
+
+	shrl	$8, %edx		C 2mod3 high
+	addl	%ecx, %eax		C apply 1mod3 low
+
+	addl	%edx, %eax		C apply 2mod3 high
+	andl	$0x000000FF, %edi
+
+	shll	$16, %edi		C 2mod3 low
+	movl	SAVE_EBX, %ebx
+
+	addl	%edi, %eax		C apply 2mod3 low
+	movl	SAVE_ESI, %esi
+
+	popl	%edi
+
+	ret
+
+EPILOGUE()
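The three accumulators above exploit 2^24 ≡ 1 (mod 2^24-1): limb i contributes at bit offset 32·i mod 24, which cycles through 0, 8, 16. A hedged, unpipelined C model (32-bit limbs assumed; like the asm, it returns a value merely congruent to the input modulo 2^24-1, not a fully reduced remainder):

```c
#include <stddef.h>
#include <stdint.h>

/* Fold {src,n} modulo 2^24-1: since 2^24 == 1 (mod 2^24-1), limb i has
   weight 2^(32*i mod 24) = 2^(8*i mod 24), and the bits above bit 24 can
   be folded back in as an addition.  Result is congruent, not reduced. */
static uint64_t mod_34lsub1_model(const uint32_t *src, size_t n)
{
    uint64_t acc = 0;
    unsigned shift = 0;
    for (size_t i = 0; i < n; i++) {
        acc += (uint64_t)src[i] << shift;       /* weight 2^shift */
        acc = (acc & 0xFFFFFF) + (acc >> 24);   /* fold high part back */
        shift = (shift + 8) % 24;
    }
    return acc;
}
```

The assembly avoids the per-limb folding by letting three 32-bit accumulators absorb the carries and combining the 24-bit fields once at the end, in L(combine).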
diff --git a/third_party/gmp/mpn/x86/k6/mode1o.asm b/third_party/gmp/mpn/x86/k6/mode1o.asm
new file mode 100644
index 0000000..4a338bd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mode1o.asm
@@ -0,0 +1,176 @@
+dnl  AMD K6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C A special case for high<divisor at the end measured only about 4 cycles
+C faster, and so isn't used.
+C
+C A special case for size==1 using a divl rather than the inverse measured
+C only about 5 cycles faster, and so isn't used.  When size==1 and
+C high<divisor it can skip a division and be a full 24 cycles faster, but
+C this isn't an important case.
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_CARRY, %edx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	xorl	%edx, %edx
+L(start_1c):
+	pushl	%edi		FRAME_pushl()
+
+	shrl	%ecx			C d/2
+	movl	PARAM_DIVISOR, %esi
+
+	andl	$127, %ecx		C d/2, 7 bits
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+Zdisp(	movzbl,	0,(%ecx,%edi), %edi)		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%ecx), %edi	C inv 8 bits
+')
+	leal	(%edi,%edi), %ecx	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ebp
+
+	imull	%esi, %edi		C inv*inv*d
+
+	pushl	%ebx		FRAME_pushl()
+	leal	(%eax,%ebp,4), %ebx	C src end
+
+	subl	%edi, %ecx		C inv = 2*inv - inv*inv*d
+	leal	(%ecx,%ecx), %edi	C 2*inv
+
+	imull	%ecx, %ecx		C inv*inv
+
+	movl	(%eax), %eax		C src low limb
+	negl	%ebp			C -size
+
+	imull	%esi, %ecx		C inv*inv*d
+
+	subl	%ecx, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax
+	movl	%esi, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax
+	popl	%eax')
+
+	jmp	L(entry)
+
+
+C Rotating the mul to the top of the loop saves 1 cycle, presumably by
+C hiding the loop control under the imul latency.
+C
+C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
+C only 8).  It's not clear how to get down to 9 cycles.
+C
+C The xor and rcl to handle the carry bit could be an sbb instead, with
+C the carry bit add becoming a sub, but that doesn't save anything.
+
+L(top):
+	C eax	(low product)
+	C ebx	src end
+	C ecx	carry bit, 0 or 1
+	C edx	(high product, being carry limb)
+	C esi	divisor
+	C edi	inverse
+	C ebp	counter, limbs, negative
+
+	mull	%esi
+
+	movl	(%ebx,%ebp,4), %eax
+	addl	%ecx, %edx		C apply carry bit to carry limb
+
+L(entry):
+	xorl	%ecx, %ecx
+	subl	%edx, %eax		C apply carry limb
+
+	rcll	%ecx
+
+	imull	%edi, %eax
+
+	incl	%ebp
+	jnz	L(top)
+
+
+
+	popl	%ebx
+	popl	%ebp
+
+	mull	%esi
+
+	popl	%edi
+	popl	%esi
+
+	leal	(%ecx,%edx), %eax
+
+	ret
+
+EPILOGUE()
+ASM_END()
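The loop above is exact-division style: each step multiplies by the precomputed inverse to make the low limb divisible by d, and the high half of q·d (plus the borrow bit) becomes the next carry. A hedged C model with 32-bit limbs; in this sketch the returned r satisfies src + r·2^(32n) = q·d exactly, so r ≡ -src·2^(-32n) (mod d) and in particular r == 0 only if d divides src:

```c
#include <stddef.h>
#include <stdint.h>

/* Exact-division-style remainder, modelling the loop above with 32-bit
   limbs.  inv must be the inverse of the odd divisor d modulo 2^32. */
static uint32_t modexact_1_odd_model(const uint32_t *src, size_t n,
                                     uint32_t d, uint32_t inv)
{
    uint64_t c = 0;                           /* carry limb (+ borrow bit) */
    for (size_t i = 0; i < n; i++) {
        uint32_t s = (uint32_t)(src[i] - c);  /* apply carry limb, mod 2^32 */
        int borrow = (uint64_t)src[i] < c;    /* the rcl-captured bit */
        uint32_t q = s * inv;                 /* low limb of quotient */
        c = (((uint64_t)q * d) >> 32) + (uint64_t)borrow;
    }
    return (uint32_t)c;
}
```

This is why the routine needs no `divl` at all: the only full-width operations are the `imull` by the inverse and the widening `mull` by d.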
diff --git a/third_party/gmp/mpn/x86/k6/mul_1.asm b/third_party/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000..3ef7ec2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,292 @@
+dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
+
+dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 5.5
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 4.87
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			 6.25
+C AMD K7
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation; it's added into
+C the low limb of the result.
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+dnl  minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+	pushl	%esi
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %esi
+	jmp	L(start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+	push	%esi
+deflit(`FRAME',4)
+	xorl	%esi, %esi	C initial carry
+
+L(start_nc):
+	mov	PARAM_SIZE, %ecx
+	push	%ebx
+FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	push	%edi
+FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	pushl	%ebp
+FRAME_pushl()
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_MULTIPLIER, %ebp
+
+	jae	L(unroll)
+
+
+	C code offset 0x22 here, close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	src
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst
+	C ebp	multiplier
+	C
+	C this loop 8 cycles/limb
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi)
+	addl	$4, %edi
+
+	loop	L(simple)
+
+
+	popl	%ebp
+
+	popl	%edi
+	popl	%ebx
+
+	movl	%esi, %eax
+	popl	%esi
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end.  Rotating the load/store to the top
+C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+	ALIGN(16)	C already aligned to 16 here actually
+L(unroll):
+	movl	(%ebx), %eax
+	leal	-16(%ebx,%ecx,4), %ebx
+
+	leal	-16(%edi,%ecx,4), %edi
+	subl	$4, %ecx
+
+	negl	%ecx
+
+
+	ALIGN(16)	C one byte nop for this alignment
+L(top):
+	C eax	scratch
+	C ebx	&src[size-4]
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	&dst[size-4]
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	12(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 12(%edi,%ecx,4)
+	movl	16(%ebx,%ecx,4), %eax
+
+
+	addl	$4, %ecx
+	js	L(top)
+
+
+
+	C eax	next src limb
+	C ebx	&src[size-4]
+	C ecx	0 to 3 representing respectively 4 to 1 further limbs
+	C edx
+	C esi	carry
+	C edi	&dst[size-4]
+
+	testb	$2, %cl
+	jnz	L(finish_not_two)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
+
+	addl	$2, %ecx
+L(finish_not_two):
+
+
+	testb	$1, %cl
+	jnz	L(finish_not_one)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi)
+	movl	12(%ebx), %eax
+L(finish_not_one):
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	popl	%ebp
+
+	adcl	$0, %edx
+
+	movl	%eax, 12(%edi)
+	popl	%edi
+
+	popl	%ebx
+	movl	%edx, %eax
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
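The carry handling in the loop above (add the incoming carry to the low
product, then fold the carry-out together with the high product into the
next carry) is the same scheme as GMP's generic C code. A minimal C sketch
of that mul_1 semantics, using 32-bit limbs and 64-bit intermediate
products; `mul_1_sketch` is a hypothetical name, not the real `mpn_mul_1`
entry point:

```c
#include <stdint.h>
#include <stddef.h>

/* Multiply the n-limb number {src, n} by the single limb m, store the low
   limbs at dst, and return the final carry (the high limb of the full
   product).  Illustration only; the real routine is mpn_mul_1 in gmp.h. */
static uint32_t mul_1_sketch(uint32_t *dst, const uint32_t *src,
                             size_t n, uint32_t m)
{
    uint32_t carry = 0;
    for (size_t i = 0; i < n; i++) {
        /* 32x32->64 product plus carry cannot overflow 64 bits */
        uint64_t p = (uint64_t)src[i] * m + carry;
        dst[i] = (uint32_t)p;          /* low half to the destination */
        carry = (uint32_t)(p >> 32);   /* high half becomes the next carry */
    }
    return carry;
}
```

The `addl %esi, %eax` / `movl $0, %esi` / `adcl %edx, %esi` triple in the
assembly is exactly the `+ carry` and `>> 32` steps here, done without a
64-bit register.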
diff --git a/third_party/gmp/mpn/x86/k6/mul_basecase.asm b/third_party/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000..7030001
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,612 @@
+dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+C     unrolling).
+
+
+
+dnl  K6: UNROLL_COUNT cycles/product (approx)
+dnl           8           9.75
+dnl          16           9.3
+dnl          32           9.3
+dnl  Maximum possible with the current code is 32.
+dnl
+dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl  might explain its good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once.  The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Enhancements:
+C
+C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
+C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
+C
+C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
+C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
+C would be 256 bytes.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+	movl	(%eax), %eax	C yp low limb
+
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two_limbs)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	movl	(%edx), %edx	C xp low limb
+	movl	PARAM_WP, %ecx
+
+	mull	%edx
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+	decl	PARAM_YSIZE
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_WP, %ebx
+	pushl	%esi
+deflit(`FRAME',8)
+
+	movl	%eax, %ecx	C yp low limb
+	movl	(%edx), %eax	C xp low limb
+
+	movl	%edx, %esi	C xp
+	jnz	L(two_by_two)
+
+
+	C two limbs by one limb
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	movl	%edx, %esi	C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+	movl	%esi, 4(%ebx)
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	popl	%esi
+
+	popl	%ebx
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp low limb
+	C ebx	wp
+	C ecx	yp low limb
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	push	%edi
+deflit(`FRAME',12)
+	movl	%eax, (%ebx)
+
+	movl	4(%esi), %eax
+	movl	%edx, %edi	C carry, for wp[1]
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+
+	movl	%edi, 4(%ebx)
+	movl	4(%ecx), %ecx	C yp[1]
+
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+
+	adcl	$0, %edx
+
+	movl	(%esi), %eax	C xp[0]
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	adcl	%edx, %edi
+	adcl	$0, %esi
+
+	movl	%edi, 8(%ebx)
+	popl	%edi
+
+	movl	%esi, 12(%ebx)
+	popl	%esi
+
+	popl	%ebx
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline.  Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 times).  A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',0)
+
+	pushl	%edi		defframe_pushl(SAVE_EDI)
+	pushl	%ebp		defframe_pushl(SAVE_EBP)
+
+	movl	PARAM_WP, %edi
+	pushl	%esi		defframe_pushl(SAVE_ESI)
+
+	movl	%eax, %ebp
+	pushl	%ebx		defframe_pushl(SAVE_EBX)
+
+	leal	(%edx,%ecx,4), %ebx	C xp end
+	xorl	%esi, %esi
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	xp end
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	carry
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%ebx,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	incl	%ecx
+
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	%esi, (%edi)		C final carry
+
+	movl	PARAM_XSIZE, %ecx
+	decl	%edx
+
+	jnz	L(ysize_more_than_one_limb)
+
+	popl	%ebx
+	popl	%esi
+	popl	%ebp
+	popl	%edi
+	ret
+
+
+L(ysize_more_than_one_limb):
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop.  The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+	C eax	yp
+	C ebx	xp end
+	C ecx	xsize
+	C edx	ysize-1
+	C esi
+	C edi	wp end of mul1
+	C ebp
+
+	movl	4(%eax), %ebp		C multiplier
+	negl	%ecx
+
+	movl	%ecx, PARAM_XSIZE	C -xsize
+	xorl	%esi, %esi		C initial carry
+
+	leal	4(%eax,%edx,4), %eax	C yp end
+	negl	%edx
+
+	movl	%eax, PARAM_YP
+	movl	%edx, PARAM_YSIZE
+
+	jmp	L(simple_outer_entry)
+
+
+	C aligning here saves a couple of cycles
+	ALIGN(16)
+L(simple_outer_top):
+	C edx	ysize counter, negative
+
+	movl	PARAM_YP, %eax		C yp end
+	xorl	%esi, %esi		C carry
+
+	movl	PARAM_XSIZE, %ecx	C -xsize
+	movl	%edx, PARAM_YSIZE
+
+	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
+L(simple_outer_entry):
+	addl	$4, %edi
+
+
+L(simple_inner):
+	C eax	scratch
+	C ebx	xp end
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	carry
+	C edi	wp end of this addmul
+	C ebp	multiplier
+
+	movl	(%ebx,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	%edx, %esi
+
+	incl	%ecx
+	jnz	L(simple_inner)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	%esi, (%edi)
+
+	incl	%edx
+	jnz	L(simple_outer_top)
+
+
+	popl	%ebx
+	popl	%esi
+	popl	%ebp
+	popl	%edi
+	ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.  This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
+C the stack remain constant and on each loop a leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER,      -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP,          -28)
+defframe(VAR_XP_LOW,       -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl  slower in this code, hence the defframe empty-if-zero feature is
+dnl  disabled.
+dnl
+dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
+dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl  anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+	C eax	yp (not used)
+	C ebx	xp end (not used)
+	C ecx	xsize
+	C edx	ysize-1
+	C esi
+	C edi	wp end of mul1 (not used)
+	C ebp
+deflit(`FRAME', 16)
+
+	leal	-2(%ecx), %ebp	C one limb processed at start,
+	decl	%ecx		C and ebp is one less
+
+	shrl	$UNROLL_LOG2, %ebp
+	negl	%ecx
+
+	subl	$VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ecx, %esi
+	shll	$4, %ecx
+
+	movl	%ebp, VAR_COUNTER_INIT
+	negl	%esi
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+	movl	PARAM_XP, %ebx
+	movl	%ebp, VAR_COUNTER
+
+	movl	PARAM_WP, %edi
+	movl	%ecx, VAR_JMP
+
+	movl	(%ebx), %eax
+	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1
+
+	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling
+
+	movl	%eax, VAR_XP_LOW
+
+	movl	%ebx, PARAM_XP
+	movl	PARAM_YP, %ebx
+
+	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
+	movl	4(%ebx), %ebp		C multiplier (yp second limb)
+
+	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing
+
+	movl	%ecx, PARAM_WP
+
+	leal	1(%esi), %ecx	C adjust parity for decl %ecx above
+
+	movl	%ebx, PARAM_YP
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%esi,1), %ecx
+	addl	$L(unroll_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
+	C cost any extra space, since the inner unrolled loop below is
+	C aligned to 32.
+	ALIGN(32)
+L(unroll_outer_top):
+	C edx	ysize
+
+	movl	PARAM_YP, %eax
+	movl	%edx, PARAM_YSIZE	C incremented ysize counter
+
+	movl	PARAM_WP, %edi
+
+	movl	VAR_COUNTER_INIT, %ebx
+	movl	(%eax,%edx,4), %ebp	C next multiplier
+
+	movl	PARAM_XSIZE, %ecx
+	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	C using testb is a tiny bit faster than testl
+	testb	$1, %cl
+
+	movl	%eax, %ecx	C low carry
+	movl	VAR_JMP, %eax
+
+	movl	%edx, %esi	C high carry
+	movl	PARAM_XP, %ebx
+
+	jnz	L(unroll_noswap)
+	movl	%ecx, %esi	C high,low carry other way around
+
+	movl	%edx, %ecx
+L(unroll_noswap):
+
+	jmp	*%eax
+
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_top):
+	C eax	scratch
+	C ebx	xp
+	C ecx	carry low
+	C edx	scratch
+	C esi	carry high
+	C edi	wp
+	C ebp	multiplier
+	C VAR_COUNTER  loop counter
+	C
+	C 15 code bytes each limb
+
+	leal	UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4))
+	deflit(`disp1', eval(disp0 + 4))
+	deflit(`disp2', eval(disp1 + 4))
+
+	movl	disp1(%ebx), %eax
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0( %ecx)
+
+	movl	disp2(%ebx), %eax
+	mull	%ebp
+	addl	%esi, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0( %esi)
+')
+
+	decl	VAR_COUNTER
+	leal	UNROLL_BYTES(%ebx), %ebx
+
+	jns	L(unroll_top)
+
+
+	movl	PARAM_YSIZE, %edx
+	addl	%ecx, UNROLL_BYTES(%edi)
+
+	adcl	$0, %esi
+
+	incl	%edx
+	movl	%esi, UNROLL_BYTES+4(%edi)
+
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+
+	addl	$FRAME, %esp
+	ret
+
+EPILOGUE()
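The structure the comments describe for mpn_mul_basecase (one inline mul_1
pass for yp[0], then an addmul pass per remaining yp limb) can be sketched
in C as follows. This is a schoolbook illustration under the assumption
xn >= yn >= 1, with a hypothetical name, not the tuned routine above:

```c
#include <stdint.h>
#include <stddef.h>

/* {wp, xn+yn} = {xp, xn} * {yp, yn}, 32-bit limbs, xn >= yn >= 1. */
static void mul_basecase_sketch(uint32_t *wp,
                                const uint32_t *xp, size_t xn,
                                const uint32_t *yp, size_t yn)
{
    /* first row, the mul_1 pass: wp[0..xn] = {xp, xn} * yp[0] */
    uint32_t carry = 0;
    for (size_t i = 0; i < xn; i++) {
        uint64_t p = (uint64_t)xp[i] * yp[0] + carry;
        wp[i] = (uint32_t)p;
        carry = (uint32_t)(p >> 32);
    }
    wp[xn] = carry;

    /* remaining rows, the addmul passes: wp[j..j+xn] += {xp, xn} * yp[j] */
    for (size_t j = 1; j < yn; j++) {
        carry = 0;
        for (size_t i = 0; i < xn; i++) {
            uint64_t p = (uint64_t)xp[i] * yp[j] + wp[j + i] + carry;
            wp[j + i] = (uint32_t)p;
            carry = (uint32_t)(p >> 32);
        }
        wp[j + xn] = carry;
    }
}
```

Doing the addmul entry setup once per yp limb, rather than once per call to
a separate mpn_addmul_1, is where the assembly's claimed 10-20% saving on
Karatsuba-sized operands comes from.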
diff --git a/third_party/gmp/mpn/x86/k6/pre_mod_1.asm b/third_party/gmp/mpn/x86/k6/pre_mod_1.asm
new file mode 100644
index 0000000..34db20d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/pre_mod_1.asm
@@ -0,0 +1,146 @@
+dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
+
+dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 18.0 cycles/limb
+
+
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                             mp_limb_t inverse);
+C
+C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
+C considered worthwhile (just).
+
+defframe(PARAM_INVERSE,16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,    8)
+defframe(PARAM_SRC,     4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+	ASSERT(ae,`cmpl $1, PARAM_SIZE')
+	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SRC, %ebp
+	pushl	%edi	FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%esi	FRAME_pushl()
+
+	movl	-4(%ebp,%ecx,4), %esi	C src high limb
+	pushl	%ebx	FRAME_pushl()
+
+	movl	%edx, %edi		C first n2 to cancel
+	subl	%eax, %esi		C first n1 = high-divisor
+
+	decl	%ecx
+	jz	L(done_sbbl)
+
+L(top):
+	C eax	scratch
+	C ebx	n10, nadj, q1
+	C ecx	counter, size to 1
+	C edx	scratch
+	C esi	n2
+	C edi	old high, for underflow test
+	C ebp	src
+
+	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
+
+L(entry):
+	andl	PARAM_DIVISOR, %edi
+L(q1_ff_top):
+	movl	-4(%ebp,%ecx,4), %ebx
+
+	addl	%esi, %edi	    C possible addback
+	movl	%ebx, %esi	    C n10
+
+	sarl	$31, %ebx	    C -n1 = 0 or -1
+	movl	%edi, %eax	    C n2
+
+	movl	PARAM_INVERSE, %edx
+	subl	%ebx, %eax	    C n2+n1
+
+	mull	%edx		    C m*(n2+n1)
+
+	andl	PARAM_DIVISOR, %ebx C -n1 & d
+	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow
+
+	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
+	leal	1(%edi), %ebx	    C n2+1
+
+	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
+
+	movl	PARAM_DIVISOR, %eax C d
+	jz	L(q1_ff)
+
+	mull	%edx		    C (q1+1)*d
+
+	subl	%eax, %esi	    C low  n-(q1+1)*d
+	loop	L(top)
+
+
+
+L(done_sbbl):
+	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
+
+	andl	PARAM_DIVISOR, %edi
+L(done_esi_edi):
+	popl	%ebx
+
+	leal	(%esi,%edi), %eax
+	popl	%esi
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
+C reached.
+
+L(q1_ff):
+	movl	PARAM_DIVISOR, %edi
+	loop	L(q1_ff_top)
+
+	jmp	L(done_esi_edi)
+
+
+EPILOGUE()
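For reference, the "simple divl" loop that the preinverted code above is
measured against looks like this in C: a straightforward
remainder-propagating division from the high limb down. The assembly
replaces each division by a multiplication with the precomputed inverse of
the divisor; this sketch keeps the division for clarity and uses a
hypothetical name:

```c
#include <stdint.h>
#include <stddef.h>

/* Remainder of the n-limb number {src, n} modulo the single limb d (d > 0),
   processing limbs from most to least significant.  Each step divides a
   two-limb value (previous remainder : next limb) by d, exactly what one
   divl per limb does in the plain x86 loop. */
static uint32_t mod_1_sketch(const uint32_t *src, size_t n, uint32_t d)
{
    uint64_t r = 0;
    for (size_t i = n; i-- > 0; )
        r = ((r << 32) | src[i]) % d;   /* r < d, so this fits in 64 bits */
    return (uint32_t)r;
}
```

The preinv variant above does the same reduction but with d normalized
(high bit set) and the quotient estimated via `mull` against the
precomputed inverse, worth about 2 c/l on K6 as the header comment notes.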
diff --git a/third_party/gmp/mpn/x86/k6/sqr_basecase.asm b/third_party/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000..b7ecb5c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,680 @@
+dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+C     product (measured on the speed difference between 17 and 33 limbs,
+C     which is roughly the Karatsuba recursing range).
+
+
+dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
+dnl  code supports.  This value is used only by the tune program to know
+dnl  what it can go up to.  (An attempt to compile with a bigger value will
+dnl  trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl  The value is determined by requiring the displacements in the unrolled
+dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
+dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+
+dnl  Allow a value from the tune program to override config.m4.
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+
+dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
+dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
+dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
+dnl
+dnl  The first addmul is the biggest, and this takes the second least
+dnl  significant limb and multiplies it by the third least significant and
+dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
+dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which if understood could give a simple speedup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %ecx
+	je	L(two_limbs)
+
+	movl	PARAM_DST, %edx
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+
+	movl	(%eax), %eax
+	movl	%edx, %ecx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+
+	pushl	%ebx
+	movl	%eax, %ebx	C src
+deflit(`FRAME',4)
+
+	movl	(%ebx), %eax
+	movl	PARAM_DST, %ecx
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, 8(%ecx)
+	movl	(%ebx), %eax
+
+	movl	%edx, 12(%ecx)
+	movl	4(%ebx), %edx
+
+	mull	%edx		C src[0]*src[1]
+
+	addl	%eax, 4(%ecx)
+
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+
+	popl	%ebx
+	addl	%eax, 4(%ecx)
+
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+	cmpl	$4, %ecx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	pushl	%ebx
+	movl	%eax, %ebx	C src
+
+	movl	(%ebx), %eax
+	movl	%edx, %ecx	C dst
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)
+	pushl	%esi
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	8(%ebx), %eax
+
+	movl	%edx, 12(%ecx)
+	pushl	%edi
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	(%ebx), %eax
+
+	movl	%edx, 20(%ecx)
+	movl	4(%ebx), %edx
+
+	mull	%edx		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	(%ebx), %eax
+
+	movl	%edx, %edi
+	movl	8(%ebx), %edx
+
+	pushl	%ebp
+	xorl	%ebp, %ebp
+
+	mull	%edx		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	4(%ebx), %eax
+
+	adcl	%edx, %ebp
+
+	movl	8(%ebx), %edx
+
+	mull	%edx		C src[1] * src[2]
+
+	addl	%eax, %ebp
+
+	adcl	$0, %edx
+
+
+	C eax	will be dst[5]
+	C ebx
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	xorl	%eax, %eax
+	addl	%esi, %esi
+	adcl	%edi, %edi
+	adcl	%ebp, %ebp
+	adcl	%edx, %edx
+	adcl	$0, %eax
+
+	addl	%esi, 4(%ecx)
+	adcl	%edi, 8(%ecx)
+	adcl	%ebp, 12(%ecx)
+
+	popl	%ebp
+	popl	%edi
+
+	adcl	%edx, 16(%ecx)
+
+	popl	%esi
+	popl	%ebx
+
+	adcl	%eax, 20(%ecx)
+	ASSERT(nc)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX,   -4)
+defframe(SAVE_ESI,   -8)
+defframe(SAVE_EDI,   -12)
+defframe(SAVE_EBP,   -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP,    -24)
+deflit(STACK_SPACE, 24)
+
+	ALIGN(16)
+L(four_or_more):
+
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	leal	4(%edx), %edi
+
+	movl	%ebx, SAVE_EBX
+	leal	4(%eax), %ebx
+
+	movl	%esi, SAVE_ESI
+	xorl	%esi, %esi
+
+	movl	%ebp, SAVE_EBP
+
+	C eax
+	C ebx	src+4
+	C ecx	size
+	C edx
+	C esi
+	C edi	dst+4
+	C ebp
+
+	movl	(%eax), %ebp	C multiplier
+	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary
+
+
+	ALIGN(16)
+L(mul_1):
+	C eax	scratch
+	C ebx	src ptr
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst ptr
+	C ebp	multiplier
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi)
+	addl	$4, %edi
+
+	loop	L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time.  The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
+
+dnl  This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl  With the unmodified &src[size] and &dst[size] pointers, the
+dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl  values up to 31.  Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+	C eax
+	C ebx	&src[size]
+	C ecx
+	C edx
+	C esi	carry
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, (%edi)
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	movl	%ecx, %edx
+ifelse(OFFSET,0,,
+`	subl	$OFFSET, %ebx')
+
+	shll	$4, %ecx
+ifelse(OFFSET,0,,
+`	subl	$OFFSET, %edi')
+
+	negl	%ecx
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+	negl	%edx
+
+
+	C The calculated jump mustn't be before the start of the available
+	C code.  This is the limitation UNROLL_COUNT puts on the src operand
+	C size, but checked here using the jump address directly.
+	C
+	ASSERT(ae,`
+	movl_text_address( L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx
+	')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	&src[size], constant
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	high limb to store
+	C edi	dst ptr, high of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
+	movl	%edx, VAR_COUNTER
+
+	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand
+
+	mull	%ebp
+
+	testb	$1, %cl
+
+	movl	%edx, %esi	C high carry
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	leal	CODE_BYTES_PER_LIMB(%edx), %edx
+
+	movl	%edx, VAR_JMP
+	leal	4(%edi), %edi
+
+	C A branch-free version of this using some xors was found to be a
+	C touch slower than just a conditional jump, despite the jump
+	C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+	jz,jnz)	L(unroll_noswap)
+	movl	%esi, %eax	C high,low carry other way around
+
+	movl	%ecx, %esi
+	movl	%eax, %ecx
+L(unroll_noswap):
+
+	jmp	*%edx
+
+
+	C Must be on an even address here so the low bit of the jump address
+	C will indicate which way around ecx/esi should start.
+	C
+	C An attempt was made at padding here to get the end of the unrolled
+	C code to come out on a good alignment, to save padding before
+	C L(corner).  This worked, but turned out to run slower than just an
+	C ALIGN(2).  The reason for this is not clear, it might be related
+	C to the different speeds on different UNROLL_COUNTs noted above.
+
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	scratch
+	C ebx	src
+	C ecx	carry low
+	C edx	scratch
+	C esi	carry high
+	C edi	dst
+	C ebp	multiplier
+	C
+	C 15 code bytes each limb
+	C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src - 4))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	addl,	%esi, disp_dst,(%edi))
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0( %esi)
+',`
+	dnl  this one comes out last
+Zdisp(	movl,	disp_src,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp_dst,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+	addl	%esi, -4+OFFSET(%edi)
+
+	movl	VAR_COUNTER, %edx
+	jadcl0(	%ecx)
+
+	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
+	movl	VAR_JMP, %ecx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %ebx
+	addl	$OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(corner):
+	C ebx	&src[size]
+	C edi	&dst[2*size-5]
+
+	movl	-12(%ebx), %ebp
+
+	movl	-8(%ebx), %eax
+	movl	%eax, %ecx
+
+	mull	%ebp
+
+	addl	%eax, -4(%edi)
+	adcl	$0, %edx
+
+	movl	-4(%ebx), %eax
+	movl	%edx, %esi
+	movl	%eax, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	adcl	$0, %edx
+
+	addl	%eax, (%edi)
+	adcl	$0, %edx
+
+	movl	%edx, %esi
+	movl	%ebx, %eax
+
+	mull	%ecx
+
+	addl	%esi, %eax
+	movl	%eax, 4(%edi)
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_DST, %edi
+	subl	$1, %ecx		C size-1 and clear carry
+
+	movl	PARAM_SRC, %ebx
+	movl	%ecx, %edx
+
+	xorl	%eax, %eax		C ready for adcl
+
+
+	ALIGN(16)
+L(lshift):
+	C eax
+	C ebx	src (for later use)
+	C ecx	counter, decrementing
+	C edx	size-1 (for later use)
+	C esi
+	C edi	dst, incrementing
+	C ebp
+
+	rcll	4(%edi)
+	rcll	8(%edi)
+	leal	8(%edi), %edi
+	loop	L(lshift)
+
+
+	adcl	%eax, %eax
+
+	movl	%eax, 4(%edi)		C dst most significant limb
+	movl	(%ebx), %eax		C src[0]
+
+	leal	4(%ebx,%edx,4), %ebx	C &src[size]
+	subl	%edx, %ecx		C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+	mull	%eax
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+
+
+	ALIGN(16)
+L(diag):
+	C eax	scratch
+	C ebx	&src[size]
+	C ecx	counter, negative
+	C edx	carry
+	C esi	scratch
+	C edi	dst[2*size-2]
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	movl	%edx, %esi
+
+	mull	%eax
+
+	addl	%esi, 4(%edi,%ecx,8)
+	adcl	%eax, 8(%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	addl	%edx, 4(%edi)		C dst most significant limb
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	addl	(%esp), %ecx
+	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+EPILOGUE()
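The three phases above (cross-product accumulation, the one-bit left shift done by the `rcll` loop, then the squares on the diagonal) together implement basecase squaring. A minimal Python sketch of that limb algorithm, purely illustrative (the 32-bit limb width and function name are assumptions, not GMP API):

```python
def sqr_basecase(src, bits=32):
    """Square the limb vector src, least significant limb first."""
    n, mask = len(src), (1 << bits) - 1
    dst = [0] * (2 * n)
    # Triangle of cross products src[i]*src[j] for i < j.
    for i in range(n):
        carry = 0
        for j in range(i + 1, n):
            t = dst[i + j] + src[i] * src[j] + carry
            dst[i + j], carry = t & mask, t >> bits
        dst[i + n] = carry
    # Double the triangle: one left shift across all limbs
    # (the rcll loop in the assembler).
    carry = 0
    for k in range(2 * n):
        t = (dst[k] << 1) | carry
        dst[k], carry = t & mask, t >> bits
    # Add the squares src[i]^2 on the diagonal.
    carry = 0
    for i in range(n):
        s = src[i] * src[i]
        for k, part in ((2 * i, s & mask), (2 * i + 1, s >> bits)):
            t = dst[k] + part + carry
            dst[k], carry = t & mask, t >> bits
    return dst
```

The shift-out of the doubling pass and the final diagonal carry are both zero because the full square always fits in 2*size limbs.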
diff --git a/third_party/gmp/mpn/x86/k7/README b/third_party/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000..5711b61
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/README
@@ -0,0 +1,174 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                      AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions.  All Athlons have
+MMX; the separate directory exists just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+                               cycles/limb
+	mpn_add/sub_n             1.6
+
+	mpn_copyi                 0.75 or 1.0   \ varying with data alignment
+	mpn_copyd                 0.75 or 1.0   /
+
+	mpn_divrem_1             17.0 integer part, 15.0 fractional part
+	mpn_mod_1                17.0
+	mpn_divexact_by3          8.0
+
+	mpn_l/rshift              1.2
+
+	mpn_mul_1                 3.4
+	mpn_addmul/submul_1       3.9
+
+	mpn_mul_basecase          4.42 cycles/crossproduct (approx)
+	mpn_sqr_basecase          2.3 cycles/crossproduct (approx)
+				  or 4.55 cycles/triangleproduct (approx)
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles.  This suggests 3 is a limit on
+the speed of the multiplication routines.  The documentation shows mul
+executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
+to get near 3 cycles, code has to be arranged so that nothing else is issued
+to IEU0.  A busy IEU0 could explain why some code takes 4 cycles and other
+apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead.  The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has 64k L1 code cache so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling.  An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in the absence of any history, will guess forward jumps
+are not taken and backward jumps are taken.  Where possible it's arranged
+that the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies.  K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there are vector-path instructions, an effort is made to get triplets
+of direct-path instructions in between them, even if there are dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl       direct
+divl       39 cycles back-to-back
+lodsl,etc  vector
+loop       1 cycle vector (decl/jnz opens up one decode slot)
+movd reg   vector
+movd mem   direct
+mull       issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl	   vector (use movl for more than one pop)
+pushl	   direct, will pair with a load
+shrdl %cl  vector, 3 cycles, seems to be 3 decode too
+xorl r,r   false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision K, February 2002.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision D, March 2000.  This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999.  This has some notes on general Athlon optimizations as well as
+3DNow.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/k7/addlsh1_n.asm b/third_party/gmp/mpn/x86/k7/addlsh1_n.asm
new file mode 100644
index 0000000..2cba1eb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/addlsh1_n.asm
@@ -0,0 +1,196 @@
+dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
+C The inner loop is 2*3-way unrolled, which is the best we can do with the
+C available registers.  It seems tricky to use the same structure for
+C rsblsh1_n, since we cannot feed carry between operations there.
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6
+C AMD K6			 ?
+C AMD K7			 2.5
+C AMD K8
+
+C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
+C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
+C that means we need an initial magic multiply.
+C
+C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
+C cannot do rsblsh1_n since we feed carry from the shift blocks to the
+C add/subtract blocks, which is right for addition but reversed for
+C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
+C without losing any time, since we're limited not by issue width but by
+C carry recurrency latency.
+C
+C Breaking carry recurrency might be a good idea.  We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
+defframe(PARAM_SIZE,	16)
+defframe(PARAM_DBLD,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_DST')
+define(VAR_TMP,`PARAM_DBLD')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+
+	mov	$0x2aaaaaab, %eax
+
+	push	%ebx			FRAME_pushl()
+	mov	PARAM_SIZE, %ebx	C size
+
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+	mul	%ebx
+
+	push	up			FRAME_pushl()
+	mov	PARAM_SRC, up
+
+	not	%edx			C count = -(size\8)-1
+	mov	%edx, VAR_COUNT
+
+	push	vp			FRAME_pushl()
+	mov	PARAM_DBLD, vp
+
+	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
+	xor	%edx, %edx
+	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
+	or	%ebx, %ebx
+	jz	L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(vp), %eax
+	adc	%eax, %eax
+	rcr	%edx			C restore 1st saved carry bit
+	lea	4(vp), vp
+	adc	(up), %eax
+	lea	4(up), up
+	adc	%edx, %edx		C save a carry bit in edx
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	dec	%ebx
+	mov	%eax, (rp)
+	lea	4(rp), rp
+	jnz	L(oop)
+	mov	vp, VAR_TMP
+L(exact):
+	incl	VAR_COUNT
+	jz	L(end)
+
+	ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(vp), %eax
+	adc	%eax, %eax
+	mov	4(vp), %ebx
+	adc	%ebx, %ebx
+	mov	8(vp), %ecx
+	adc	%ecx, %ecx
+
+	rcr	%edx			C restore 1st saved carry bit
+
+	adc	(up), %eax
+	mov	%eax, (rp)
+	adc	4(up), %ebx
+	mov	%ebx, 4(rp)
+	adc	8(up), %ecx
+	mov	%ecx, 8(rp)
+
+	mov	12(vp), %eax
+	adc	%eax, %eax
+	mov	16(vp), %ebx
+	adc	%ebx, %ebx
+	mov	20(vp), %ecx
+	adc	%ecx, %ecx
+
+	lea	24(vp), vp
+	adc	%edx, %edx		C save a carry bit in edx
+
+	adc	12(up), %eax
+	mov	%eax, 12(rp)
+	adc	16(up), %ebx
+	mov	%ebx, 16(rp)
+	adc	20(up), %ecx
+
+	lea	24(up), up
+
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	mov	%ecx, 20(rp)
+	incl	VAR_COUNT
+	lea	24(rp), rp
+	jne	L(top)
+
+L(end):
+	pop	vp			FRAME_popl()
+	pop	up			FRAME_popl()
+
+ifdef(`CPU_P6',`
+	xor	%eax, %eax
+	shr	$1, %edx
+	adc	%edx, %eax
+',`
+	adc	$0, %edx
+	mov	%edx, %eax
+')
+	pop	rp			FRAME_popl()
+	pop	%ebx			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
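The loop above keeps two carry chains alive at once, the bit shifted out of `vp[]` and the carry of the addition, stashing both in %edx between iterations. A hedged Python sketch of the limb semantics (limb width and names are illustrative, not GMP API):

```python
def addlsh1_n(up, vp, bits=32):
    """rp[] = up[] + (vp[] << 1); returns (rp, carry_out), carry_out in 0..2."""
    mask = (1 << bits) - 1
    rp, shift_c, add_c = [], 0, 0
    for u, v in zip(up, vp):
        t = (v << 1) | shift_c      # shift this limb, taking the bit from below
        shift_c = t >> bits         # bit shifted out of this limb
        s = u + (t & mask) + add_c  # add with carry
        rp.append(s & mask)
        add_c = s >> bits
    return rp, shift_c + add_c
```

Since both the shift bit and the add carry can be pending at the top, the carry-out can be as large as 2, which is why the assembler cannot simply reuse this structure for rsblsh1_n.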
diff --git a/third_party/gmp/mpn/x86/k7/aors_n.asm b/third_party/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000..1a08072
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,258 @@
+dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.64 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           8           1.9
+dnl          16           1.64
+dnl          32           1.7
+dnl          64           2.0
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                         mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                   mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which might be the best possible with
+C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
+C which can be done each cycle, leading to 1.5 c/l.
+
+dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+	TEXT
+	ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+	movl	PARAM_CARRY, %eax
+	jmp	L(start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+	xorl	%eax, %eax	C carry
+L(start):
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	movl	%ebx, SAVE_EBX
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_SRC2, %edx
+	movl	PARAM_SRC1, %ebx
+	jae	L(unroll)
+
+	movl	PARAM_DST, %edi
+	leal	(%ebx,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %edx
+
+	leal	(%edi,%ecx,4), %edi
+	negl	%ecx
+	shrl	%eax
+
+	C This loop is in a single 16 byte code block already, so no
+	C alignment necessary.
+L(simple):
+	C eax	scratch
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	M4_inst	(%edx,%ecx,4), %eax
+	movl	%eax, (%edi,%ecx,4)
+	incl	%ecx
+	jnz	L(simple)
+
+	movl	$0, %eax
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	setc	%al
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+	movl	%ebp, SAVE_EBP
+	andl	$-2, %ecx		C size low bit masked out
+	andl	$1, PARAM_SIZE		C size low bit kept
+
+	movl	%ecx, %edi
+	decl	%ecx
+	movl	PARAM_DST, %ebp
+
+	shrl	$UNROLL_LOG2, %ecx
+	negl	%edi
+	movl	%esi, SAVE_ESI
+
+	andl	$UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
+')
+	negl	%edi
+	shrl	%eax
+
+	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%edi,%edi,8), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(top):
+	C eax	zero
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi	scratch (was computed jump)
+	C edi	dst
+	C ebp	scratch
+
+	leal	UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%ebx), %esi)
+	movl	disp1(%ebx), %ebp
+Zdisp(	M4_inst,disp0,(%edx), %esi)
+Zdisp(	movl,	%esi, disp0,(%edi))
+	M4_inst	disp1(%edx), %ebp
+	movl	%ebp, disp1(%edi)
+')
+
+	decl	%ecx
+	leal	UNROLL_BYTES(%ebx), %ebx
+	leal	UNROLL_BYTES(%edi), %edi
+	jns	L(top)
+
+
+	mov	PARAM_SIZE, %esi
+	movl	SAVE_EBP, %ebp
+	movl	$0, %eax
+
+	decl	%esi
+	js	L(even)
+
+	movl	(%ebx), %ecx
+	M4_inst	UNROLL_BYTES(%edx), %ecx
+	movl	%ecx, (%edi)
+L(even):
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	setc	%al
+
+	movl	SAVE_ESI, %esi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/aorsmul_1.asm b/third_party/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000..eec8df6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,167 @@
+dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)		 6.5
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7			 3.75
+C AMD K8
+
+C TODO
+C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
+C    but lose by 2x for n == 1.
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %eax
+	mov	32(%esp), %ecx
+	mov	%eax, %ebx
+	shr	$2, %eax
+	mov	%eax, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	lea	-4(%esi), %esi
+	lea	-4(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	cmpl	$0, 28(%esp)
+	jz	L(cj1)
+	mov	8(%esi), %eax
+	jmp	L(1)
+
+L(b2):	mul	%ecx
+	mov	%eax, %ebp
+	mov	4(%esi), %eax
+	mov	%edx, %ebx
+	cmpl	$0, 28(%esp)
+	jne	L(2)
+	jmp	L(cj2)
+
+L(b3):	lea	-12(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	16(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(b0):	lea	-8(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebp
+	mov	12(%esi), %eax
+	mov	%edx, %ebx
+	jmp	L(0)
+
+	ALIGN(16)
+L(top):	lea	16(%edi), %edi
+L(2):	mul	%ecx
+	ADDSUB	%ebp, 0(%edi)
+	mov	$0, %ebp
+	adc	%eax, %ebx
+	mov	8(%esi), %eax
+	adc	%edx, %ebp
+L(1):	mul	%ecx
+	ADDSUB	%ebx, 4(%edi)
+	mov	$0, %ebx
+	adc	%eax, %ebp
+	mov	12(%esi), %eax
+	adc	%edx, %ebx
+L(0):	mul	%ecx
+	ADDSUB	%ebp, 8(%edi)
+	mov	$0, %ebp
+	adc	%eax, %ebx
+	adc	%edx, %ebp
+	mov	16(%esi), %eax
+L(3):	mul	%ecx
+	ADDSUB	%ebx, 12(%edi)
+	adc	%eax, %ebp
+	mov	20(%esi), %eax
+	lea	16(%esi), %esi
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	decl	28(%esp)
+	jnz	L(top)
+
+L(end):	lea	16(%edi), %edi
+L(cj2):	mul	%ecx
+	ADDSUB	%ebp, (%edi)
+	adc	%eax, %ebx
+	adc	$0, %edx
+L(cj1):	ADDSUB	%ebx, 4(%edi)
+	adc	$0, %edx
+	mov	%edx, %eax
+	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
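The unrolled loop above computes dst[] = dst[] +/- src[]*multiplier one limb product at a time, folding the high word of each `mull` into the next limb's carry. A Python sketch of the addmul direction (names and the 32-bit limb width are illustrative assumptions):

```python
def addmul_1(dst, src, q, bits=32):
    """mpn_addmul_1 sketch: dst[i] += src[i] * q, propagating carries in
    place; returns the carry-out limb.  mpn_submul_1 is the same shape
    with subtraction."""
    mask = (1 << bits) - 1
    c = 0
    for i, s in enumerate(src):
        t = dst[i] + s * q + c      # limb + double-limb product + carry
        dst[i] = t & mask
        c = t >> bits               # carry limb, always fits in one limb
    return c
```

The carry always fits in a single limb because limb + (2^b-1)^2 + carry is at most 2^(2b)-1, which is the invariant the assembler's adcl chains rely on.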
diff --git a/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm b/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm
new file mode 100644
index 0000000..2af7bb9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm
@@ -0,0 +1,245 @@
+dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      9.0
+
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort.  The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	PARAM_SHIFT, %ecx	C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	movl	PARAM_INVERSE, %eax	C inv
+
+L(common):
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	%edi, VAR_DST_END
+	xorl	%ebx, %ebx
+	jmp	L(entry)
+
+	ALIGN(8)
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or 1
+	C ecx	shift
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi,%ebp,4), %eax
+	movl	(%esi,%ebp,4), %edi
+
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebx, %eax		C apply carry bit
+	setc	%bl
+	movl	VAR_DST_END, %edi
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ebx
+
+L(entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	incl	%ebp
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi), %eax		C src high limb
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebx, %eax		C apply carry bit
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+L(one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	SAVE_EBP, %ebp
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	$-1, %ecx		C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	C If there are usually only one or two trailing zero bits then this
+	C should be faster than bsfl.
+L(strip_twos):
+	incl	%ecx
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	movl	%ebx, SAVE_EBX
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+ASM_END()
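The inverse setup above starts from an 8-bit table value and applies the Newton step inv = 2*inv - inv*inv*d twice, each step doubling the number of correct low bits. A Python sketch of that lifting (seeded from d itself rather than binvert_limb_table, an assumption made so the example is self-contained):

```python
def binvert_limb(d, bits=32):
    """Inverse of odd d modulo 2**bits by Newton lifting; each step
    inv = 2*inv - inv*inv*d doubles the number of correct low bits."""
    assert d & 1
    mask = (1 << bits) - 1
    inv = d & mask   # correct to 3 bits, since d*d == 1 (mod 8) for odd d
    prec = 3
    while prec < bits:
        inv = (2 * inv - inv * inv * d) & mask
        prec *= 2
    return inv
```

If d*inv = 1 + e*2^k, then d*(2*inv - inv*inv*d) = 1 - e^2*2^(2k), so k correct bits become 2k, matching the ASSERT at the end of the prologue that d*inv == 1 mod 2^GMP_LIMB_BITS.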
diff --git a/third_party/gmp/mpn/x86/k7/dive_1.asm b/third_party/gmp/mpn/x86/k7/dive_1.asm
new file mode 100644
index 0000000..458bd02
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/dive_1.asm
@@ -0,0 +1,208 @@
+dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort.  The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	$-1, %ecx		C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	C If there are usually only one or two trailing zero bits then this
+	C should be faster than bsfl.
+L(strip_twos):
+	incl	%ecx
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	movl	%ebx, SAVE_EBX
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	%edi, VAR_DST_END
+	xorl	%ebx, %ebx
+	jmp	L(entry)
+
+	ALIGN(8)
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or 1
+	C ecx	shift
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi,%ebp,4), %eax
+	movl	(%esi,%ebp,4), %edi
+
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebx, %eax		C apply carry bit
+	setc	%bl
+	movl	VAR_DST_END, %edi
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ebx
+
+L(entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	incl	%ebp
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi), %eax		C src high limb
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebx, %eax		C apply carry bit
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	SAVE_EBP, %ebp
+	movl	%eax, -4(%edi)
+
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
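The divexact_1 routine above divides by multiplying with the 2-adic inverse of the divisor (the `binvert_limb_table` seed plus two Newton refinements), propagating a borrow instead of a remainder. A minimal Python sketch of that technique, not part of the patch, assuming 32-bit limbs and an odd divisor (the asm additionally strips trailing zero bits of d and pre-shifts the dividend with `shrdl`):

```python
MASK = (1 << 32) - 1  # one 32-bit limb

def binvert(d):
    """Inverse of odd d mod 2^32 by Newton iteration.

    The asm seeds from an 8-bit table (binvert_limb_table) and refines
    twice; here we seed with d itself (correct to 3 bits for odd d) and
    refine four times: 3 -> 6 -> 12 -> 24 -> 48 valid bits.
    """
    assert d & 1
    inv = d
    for _ in range(4):
        inv = (inv * (2 - d * inv)) & MASK  # the 2*inv - inv*inv*d step
    return inv

def divexact_1(src, d):
    """Exact division of a little-endian limb vector by odd d."""
    inv = binvert(d)
    out = []
    c = 0                       # borrow carried between limbs
    for s in src:
        t = s - c               # apply the carry (may go negative)
        q = (t * inv) & MASK    # then q*d == t  (mod 2^32)
        out.append(q)
        c = (q * d - t) >> 32   # exact: q*d - t is a multiple of 2^32
    return out                  # c ends at 0 when the division was exact
```

This mirrors the loop's `mull PARAM_DIVISOR` / `subl` / `imull VAR_INVERSE` chain; no quotient digit estimation or correction is ever needed, which is why the dependent chain is just mul+imul+sub.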
diff --git a/third_party/gmp/mpn/x86/k7/gcd_11.asm b/third_party/gmp/mpn/x86/k7/gcd_11.asm
new file mode 100644
index 0000000..2648dfd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/gcd_11.asm
@@ -0,0 +1,107 @@
+dnl  x86 mpn_gcd_11 optimised for AMD K7.
+
+dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K7	 5.31
+C AMD K8,K9	 5.33
+C AMD K10	 5.30
+C AMD bd1	 ?
+C AMD bobcat	 7.02
+C Intel P4-2	10.1
+C Intel P4-3/4	10.0
+C Intel P6/13	 5.88
+C Intel core2	 6.26
+C Intel NHM	 6.83
+C Intel SBR	 8.50
+C Intel atom	 8.90
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+
+	mov	12(%esp), %eax
+	mov	16(%esp), %edx
+
+	LEAL(	ctz_table, %esi)
+	jmp	L(odd)
+
+	ALIGN(16)			C
+L(top):	cmovc(	%ecx, %eax)		C u = |v - u|
+	cmovc(	%edi, %edx)		C v = min(u,v)
+L(mid):	and	$MASK, %ecx		C
+	movzbl	(%esi,%ecx), %ecx	C
+	jz	L(shift_alot)		C
+	shr	%cl, %eax		C
+L(odd):	mov	%eax, %edi		C
+	mov	%edx, %ecx		C
+	sub	%eax, %ecx		C
+	sub	%edx, %eax		C
+	jnz	L(top)			C
+
+L(end):	mov	%edx, %eax
+	pop	%esi
+	pop	%edi
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, %eax
+	mov	%eax, %ecx
+	jmp	L(mid)
+EPILOGUE()
+ASM_END()
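The gcd_11 loop above is a binary (subtract-and-shift) gcd on two odd limbs: the `cmovc` pair computes u = |u - v| and v = min(u, v), and trailing zeros are stripped through the 64-entry `ctz_table`, with `L(shift_alot)` handling the case where all six low bits are zero. A Python sketch of the same control flow, not part of the patch:

```python
MAXSHIFT = 6
MASK = (1 << MAXSHIFT) - 1

# ctz_table[n] = number of trailing zeros of n, or MAXSHIFT for n == 0,
# matching the table built with m4_count_trailing_zeros above.
ctz_table = [MAXSHIFT] + [(i & -i).bit_length() - 1 for i in range(1, MASK + 1)]

def gcd_11(u, v):
    """Binary gcd of two odd positive limbs, following the asm's loop."""
    assert u & 1 and v & 1
    while u != v:
        u, v = abs(u - v), min(u, v)   # the cmovc pair at L(top)
        t = u & MASK
        while t == 0:                  # L(shift_alot): low 6 bits all zero
            u >>= MAXSHIFT
            t = u & MASK
        u >>= ctz_table[t]             # table lookup strips the twos
    return u
```

Since u and v are both odd, u - v is always even and nonzero inside the loop, so at least one table shift happens per iteration; the table bounds the shift at MAXSHIFT bits per lookup, which is why the asm only needs a fallback branch rather than `bsf`.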
diff --git a/third_party/gmp/mpn/x86/k7/gmp-mparam.h b/third_party/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000..25b22e2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,262 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2083 MHz K7 Barton */
+/* FFT tuning limit = 49,770,069 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        24
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 27.00% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           26
+
+#define DIV_1_VS_MUL_1_PERCENT             182
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               208
+#define MUL_TOOM8H_THRESHOLD               309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     102
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     121
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 86
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                270
+#define SQR_TOOM8_THRESHOLD                446
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     28, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     63, 7}, {    127, 8}, {     71, 9}, {     39, 6}, \
+    {    319, 9}, {     47, 8}, {     99, 6}, {    399, 9}, \
+    {     55,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95, 8}, {    191, 4}, \
+    {   3135, 5}, {   1599, 4}, {   3455, 6}, {    959, 8}, \
+    {    247,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    199,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159, 9}, {    319, 8}, {    639, 7}, {   1279,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    799,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511, 8}, {   1023,10}, {    271, 9}, {    543, 8}, \
+    {   1087, 9}, {    575,11}, {    159, 9}, {    639,10}, \
+    {    335, 9}, {    671, 8}, {   1343,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    799, 8}, \
+    {   1599,11}, {    223,10}, {    447,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543, 9}, \
+    {   1087,10}, {    575, 9}, {   1151,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    639, 9}, {   1343,10}, \
+    {    703, 9}, {   1407,12}, {    191,11}, {    383,10}, \
+    {    767, 9}, {   1535,10}, {    799, 9}, {   1599,10}, \
+    {    831, 9}, {   1727, 8}, {   3455,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023, 9}, \
+    {   2047,11}, {    543,10}, {   1087,11}, {    575,10}, \
+    {   1151, 9}, {   2303,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407,11}, {    735,10}, \
+    {   1471, 9}, {   2943,12}, {    383,11}, {    767,10}, \
+    {   1535,11}, {    799,10}, {   1599,11}, {    831,10}, \
+    {   1663,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    895,10}, {   1791,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1023,10}, {   2111,11}, \
+    {   1087,10}, {   2175,12}, {    575,11}, {   1151,10}, \
+    {   2303,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1407,10}, {   2815,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1663,10}, {   3327,11}, \
+    {   1727,10}, {   3455,12}, {    895,11}, {   1855,12}, \
+    {    959,11}, {   1919,10}, {   3839,14}, {    255,13}, \
+    {    511,12}, {   1023,11}, {   2111,12}, {   1087,11}, \
+    {   2239,12}, {   1151,11}, {   2303,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,12}, \
+    {   1407,11}, {   2815,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1663,11}, {   3327,12}, {   1727,11}, \
+    {   3455,13}, {    895,12}, {   1919,11}, {   3839,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2687,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,12}, {   3455,13}, {   1791,12}, \
+    {   3583,13}, {   1919,12}, {   3967,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2559,12}, {   5119,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,14}, {   2559,13}, {   5119,14}, {   2815,13}, \
+    {   5887,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 254
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             492  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    492, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135, 8}, {    271, 9}, \
+    {    143,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191, 8}, {    383,10}, {    111,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    303,10}, {    159, 9}, {    319, 8}, {    639,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    767, 9}, \
+    {    399,10}, {    207,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671, 8}, {   1343, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767, 8}, {   1535,10}, {    399, 9}, \
+    {    799, 8}, {   1599, 9}, {    863,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1087,10}, {    575, 9}, {   1215,10}, {    639, 9}, \
+    {   1279,10}, {    671, 9}, {   1343,11}, {    351,10}, \
+    {    703, 9}, {   1407,10}, {    735, 9}, {   1471,12}, \
+    {    191,11}, {    383,10}, {    767, 9}, {   1535,10}, \
+    {    799, 9}, {   1599,11}, {    415,10}, {    831, 9}, \
+    {   1663,10}, {    863, 9}, {   1727, 8}, {   3455,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023, 9}, {   2047,11}, {    543,10}, \
+    {   1087, 9}, {   2175,11}, {    575,10}, {   1151, 9}, \
+    {   2303,11}, {    607,10}, {   1215, 9}, {   2431,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407, 9}, {   2815,11}, \
+    {    735,10}, {   1471, 9}, {   2943,12}, {    383,11}, \
+    {    767,10}, {   1599,11}, {    831,10}, {   1663, 9}, \
+    {   3327,10}, {   1727,12}, {    447,11}, {    895,10}, \
+    {   1791,11}, {    959,10}, {   1919,13}, {    255,12}, \
+    {    511,11}, {   1023,10}, {   2111,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,10}, {   2303,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,10}, {   2815,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1663,10}, {   3327,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1791,12}, {    959,11}, \
+    {   1919,10}, {   3839,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2111,12}, {   1087,11}, {   2239,12}, \
+    {   1151,11}, {   2303,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1343,11}, {   2687,12}, {   1407,11}, \
+    {   2815,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1599,11}, {   3199,12}, {   1663,11}, {   3327,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1791,11}, \
+    {   3583,12}, {   1919,11}, {   3839,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2687,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1535,12}, {   3199,13}, \
+    {   1663,12}, {   3455,13}, {   1791,12}, {   3583,13}, \
+    {   1919,12}, {   3967,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2175,12}, {   4351,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4351,14}, {   2303,13}, \
+    {   4991,14}, {   2559,13}, {   5119,14}, {   2815,13}, \
+    {   5887,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 258
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 45
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                144
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               202
+#define INV_APPR_THRESHOLD                 206
+
+#define BINV_NEWTON_THRESHOLD              224
+#define REDC_1_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1387
+#define MUPI_DIV_QR_THRESHOLD               82
+#define MU_BDIV_QR_THRESHOLD              1308
+#define MU_BDIV_Q_THRESHOLD               1387
+
+#define POWM_SEC_TABLE  1,16,102,428,1221
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               254
+#define SET_STR_PRECOMPUTE_THRESHOLD       890
+
+#define FAC_DSC_THRESHOLD                  206
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 3.84% faster than 4 */
+#define HGCD_THRESHOLD                     123
+#define HGCD_APPR_THRESHOLD                151
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   435
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   4  /* 8.04% faster than 3 */
+
+/* Tuneup completed successfully, took 175382 seconds */
diff --git a/third_party/gmp/mpn/x86/k7/invert_limb.asm b/third_party/gmp/mpn/x86/k7/invert_limb.asm
new file mode 100644
index 0000000..31a867e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/invert_limb.asm
@@ -0,0 +1,194 @@
+dnl  x86 mpn_invert_limb
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2009, 2011, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles (approx)	div
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			41		53
+C AMD K8			 ?
+
+C TODO
+C  * These c/l numbers are for a non-PIC build.  Consider falling back to using
+C    the 'div' instruction for PIC builds.
+C  * Perhaps use this file--or at least the algorithm--for more machines than k7.
+
+C Register usage:
+C   Input D in %edi
+C   Current approximation is in %eax and/or %ecx
+C   %ebx and %edx are temporaries
+C   %esi and %ebp are unused
+
+defframe(PARAM_DIVISOR,4)
+
+ASM_START()
+
+C Make approx_tab global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+	deflit(`approx_tab', MPN(invert_limb_tab))
+	GLOBL	approx_tab')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_invert_limb)
+deflit(`FRAME', 0)
+	mov	PARAM_DIVISOR, %eax
+	C Avoid push/pop on k7.
+	sub	$8, %esp	FRAME_subl_esp(8)
+	mov	%ebx, (%esp)
+	mov	%edi, 4(%esp)
+
+	mov	%eax, %edi
+	shr	$22, %eax
+ifdef(`PIC',`
+	LEAL(	approx_tab, %ebx)
+	movzwl	-1024(%ebx, %eax, 2), %eax
+',`
+	movzwl	-1024+approx_tab(%eax, %eax), %eax	C %eax = v0
+')
+
+	C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
+	mov	%eax, %ecx
+	imul	%eax, %eax
+	mov	%edi, %ebx
+	shr	$11, %ebx
+	inc	%ebx
+	mul	%ebx
+	mov	%edi, %ebx				C Prepare
+	shr	%ebx
+	sbb	%eax, %eax
+	sub	%eax, %ebx				C %ebx = d_31, %eax = mask
+	shl	$4, %ecx
+	dec	%ecx
+	sub	%edx, %ecx				C %ecx = v1
+
+	C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
+	imul	%ecx, %ebx
+	and	%ecx, %eax
+	shr	%eax
+	sub	%ebx, %eax
+	mul	%ecx
+	mov	%edi, %eax				C Prepare for next mul
+	shl	$15, %ecx
+	shr	%edx
+	add	%edx, %ecx				C %ecx = v2
+
+	mul	%ecx
+	add	%edi, %eax
+	mov	%ecx, %eax
+	adc	%edi, %edx
+	sub	%edx, %eax				C %eax = v3
+
+	mov	(%esp), %ebx
+	mov	4(%esp), %edi
+	add	$8, %esp
+
+	ret
+
+EPILOGUE()
+
+DEF_OBJECT(approx_tab,2)
+	.value	0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
+	.value	0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
+	.value	0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
+	.value	0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
+	.value	0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
+	.value	0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
+	.value	0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
+	.value	0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
+	.value	0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
+	.value	0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
+	.value	0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
+	.value	0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
+	.value	0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
+	.value	0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
+	.value	0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
+	.value	0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
+	.value	0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
+	.value	0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
+	.value	0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
+	.value	0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
+	.value	0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
+	.value	0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
+	.value	0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
+	.value	0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
+	.value	0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
+	.value	0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
+	.value	0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
+	.value	0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
+	.value	0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
+	.value	0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
+	.value	0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
+	.value	0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
+	.value	0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
+	.value	0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
+	.value	0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
+	.value	0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
+	.value	0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
+	.value	0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
+	.value	0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
+	.value	0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
+	.value	0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
+	.value	0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
+	.value	0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
+	.value	0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
+	.value	0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
+	.value	0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
+	.value	0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
+	.value	0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
+	.value	0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
+	.value	0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
+	.value	0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
+	.value	0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
+	.value	0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
+	.value	0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
+	.value	0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
+	.value	0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
+	.value	0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
+	.value	0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
+	.value	0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
+	.value	0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
+	.value	0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
+	.value	0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
+	.value	0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
+	.value	0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
+END_OBJECT(approx_tab)
+ASM_END()
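The invert_limb routine computes the pre-inverted divisor used by GMP's division-by-multiplication code: a 16-bit seed from the 512-entry `approx_tab`, refined by the v1/v2/v3 Newton-style steps in the code above. The value those steps converge to has a simple closed form; a Python sketch of that reference semantics (the specification, not the table-driven refinement itself), assuming a 32-bit limb base:

```python
B = 1 << 32  # limb base

def invert_limb(d):
    """Reference value of mpn_invert_limb for a normalized divisor d.

    d must have its high bit set; the result is the 32-bit 'magic'
    inverse such that B + inv approximates (B^2 - 1) / d from below.
    """
    assert B // 2 <= d < B
    return (B * B - 1) // d - B
```

Division code then estimates a quotient limb from a multiply by `B + inv` instead of a hardware `div`, which is the point of the 41-vs-53 cycle comparison in the header comment.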
diff --git a/third_party/gmp/mpn/x86/k7/mmx/com.asm b/third_party/gmp/mpn/x86/k7/mmx/com.asm
new file mode 100644
index 0000000..a258c22
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/com.asm
@@ -0,0 +1,125 @@
+dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.0 cycles/limb
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop form below is necessary for the claimed speed.  It needs to be
+C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
+C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
+C the movq's and achieve the necessary size.
+C
+C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
+C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
+C (offset by the size, as per the loop addressing) and one high limb
+C processed separately to get alignment.
+C
+C The padding for the nails case is unattractive, but shouldn't cost any
+C cycles.  Explicit .byte's guarantee the desired instructions, at a point
+C where we're probably stalled waiting for loads anyway.
+C
+C Enhancements:
+C
+C The combination load/pxor/store might be able to be unrolled to approach
+C 0.5 c/l if desired.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_DST, %edx
+	movl	PARAM_SIZE, %ecx
+	pcmpeqd	%mm7, %mm7
+
+	leal	(%edx,%ecx,4), %eax
+	andl	$4, %eax
+ifelse(GMP_NAIL_BITS,0,,
+`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK
+
+	movl	PARAM_SRC, %eax
+	movd	-4(%eax,%ecx,4), %mm0		C src high limb
+
+ifelse(GMP_NAIL_BITS,0,,
+`	C padding for alignment below
+	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
+	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
+')
+
+	jz	L(aligned)
+
+	pxor	%mm7, %mm0
+	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
+	decl	%ecx
+	jz	L(done)
+L(aligned):
+
+	addl	$4, %eax
+	addl	$4, %edx
+	decl	%ecx
+	jz	L(one)
+
+	C offset 0x30 for no nails, or 0x40 for nails
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
+
+	subl	$2, %ecx
+	movq	(%eax,%ecx,4), %mm0
+	pxor	%mm7, %mm0
+	movq	%mm0, (%edx,%ecx,4)
+	jg	L(top)
+
+	jnz	L(done)				C if size even
+
+L(one):
+	movd	-4(%eax), %mm0			C src low limb
+	pxor	%mm7, %mm0
+	movd	%mm0, -4(%edx)			C dst low limb
+
+L(done):
+	emms
+
+	ret
+
+EPILOGUE()
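Functionally, mpn_com is just a limb-wise one's complement; everything in the file above is about doing it two limbs per cycle with `pxor` against an all-ones (nail-masked) `%mm7`, peeling one high limb when needed for 8-byte alignment. A trivial Python sketch of the operation itself, not part of the patch, assuming a no-nails 32-bit build:

```python
GMP_NUMB_BITS = 32                      # no nails in the common build
NUMB_MASK = (1 << GMP_NUMB_BITS) - 1    # the masked %mm7 constant

def mpn_com(src):
    """Limb-wise one's complement of a limb vector."""
    return [s ^ NUMB_MASK for s in src]
```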
diff --git a/third_party/gmp/mpn/x86/k7/mmx/copyd.asm b/third_party/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000..59ece40
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,144 @@
+dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K7    0.75  1.0   1.0   0.75
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl  parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl  minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	jae	L(unroll)
+
+	orl	%ecx, %ecx
+	jz	L(simple_done)
+
+L(simple):
+	C eax	src
+	C ebx	scratch
+	C ecx	counter
+	C edx	dst
+	C
+	C this loop is 2 cycles/limb
+
+	movl	-4(%eax,%ecx,4), %ebx
+	movl	%ebx, -4(%edx,%ecx,4)
+	decl	%ecx
+	jnz	L(simple)
+
+L(simple_done):
+	movl	SAVE_EBX, %ebx
+	ret
+
+
+L(unroll):
+	movl	%esi, SAVE_ESI
+	leal	(%eax,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %esi
+
+	andl	%esi, %ebx
+	movl	SAVE_ESI, %esi
+	subl	$4, %ecx		C size-4
+
+	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
+	jz	L(aligned)
+
+	C both src and dst unaligned, process one limb to align them
+	movl	12(%eax,%ecx,4), %ebx
+	movl	%ebx, 12(%edx,%ecx,4)
+	decl	%ecx
+L(aligned):
+
+
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, limbs
+	C edx	dst
+
+	movq	8(%eax,%ecx,4), %mm0
+	movq	(%eax,%ecx,4), %mm1
+	subl	$4, %ecx
+	movq	%mm0, 16+8(%edx,%ecx,4)
+	movq	%mm1, 16(%edx,%ecx,4)
+	jns	L(top)
+
+
+	C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+	testb	$2, %cl
+	jz	L(finish_not_two)
+
+	movq	8(%eax,%ecx,4), %mm0
+	movq	%mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+	testb	$1, %cl
+	jz	L(done)
+
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+
+L(done):
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/copyi.asm b/third_party/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000..9a28f92
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,157 @@
+dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K7    0.75  1.0   1.0   0.75
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
+C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
+C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
+C one cycle, so perhaps some scheduling is needed to ensure it's a
+C load+store in each cycle, not store+store.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
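As a behavioural model of the above (a Python sketch, not GMP code; offsets stand in for pointers), the incrementing copy goes lowest limb first, the direction that tolerates dst at or below src in an overlap:

```python
def mpn_copyi(dst, src, d0, s0, size):
    # Incrementing copy: lowest limb first, matching the asm loop order,
    # so an overlapping copy with dst at or below src is safe.
    for i in range(size):
        dst[d0 + i] = src[s0 + i]

# overlapping shift-down of four limbs by one position
v = [0, 1, 2, 3, 4]
mpn_copyi(v, v, 0, 1, 4)
assert v == [1, 2, 3, 4, 4]
```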
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl  minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	jae	L(unroll)
+
+	orl	%ecx, %ecx
+	jz	L(simple_done)
+
+L(simple):
+	C eax	src, incrementing
+	C ebx	scratch
+	C ecx	counter
+	C edx	dst, incrementing
+	C
+	C this loop is 2 cycles/limb
+
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	decl	%ecx
+	leal	4(%eax), %eax
+	leal	4(%edx), %edx
+	jnz	L(simple)
+
+L(simple_done):
+	movl	SAVE_EBX, %ebx
+	ret
+
+
+L(unroll):
+	movl	%eax, %ebx
+	leal	-12(%eax,%ecx,4), %eax	C src end - 12
+	subl	$3, %ecx		C size-3
+
+	andl	%edx, %ebx
+	leal	(%edx,%ecx,4), %edx	C dst end - 12
+	negl	%ecx
+
+	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
+	jz	L(aligned)
+
+	C both src and dst unaligned, process one limb to align them
+	movl	(%eax,%ecx,4), %ebx
+	movl	%ebx, (%edx,%ecx,4)
+	incl	%ecx
+L(aligned):
+
+
+	ALIGN(16)
+L(top):
+	C eax	src end - 12
+	C ebx
+	C ecx	counter, negative, limbs
+	C edx	dst end - 12
+
+	movq	(%eax,%ecx,4), %mm0
+	movq	8(%eax,%ecx,4), %mm1
+	addl	$4, %ecx
+	movq	%mm0, -16(%edx,%ecx,4)
+	movq	%mm1, -16+8(%edx,%ecx,4)
+	ja	L(top)		C jump no carry and not zero
+
+
+	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+	testb	$2, %cl
+	jnz	L(finish_not_two)
+
+	movq	(%eax,%ecx,4), %mm0
+	movq	%mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+	testb	$1, %cl
+	jnz	L(done)
+
+	movl	8(%eax), %ebx
+	movl	%ebx, 8(%edx)
+
+L(done):
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm b/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000..cf34328
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,832 @@
+dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
+dnl  division.
+
+dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code.  "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
+C q1==0xFFFFFFFF, then q1+1 would overflow.  We branch to a special case
+C "q1_ff" if this occurs.  Since the true quotient is either q1 or q1+1 then
+C if q1==0xFFFFFFFF that must be the right value.
+C
+C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
+C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1.  This
+C then goes through as normal, and finds no addback required.  sbbl costs
+C an extra cycle over what the main loop code does, but it keeps code size
+C and complexity down.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is used.  The latter
+C is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case.  It doesn't seem worth checking for this though, since it
+C probably occurs infrequently, in particular note that big_base for a
+C decimal mpn_get_str is not normalized in a 32-bit limb.
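The inverse and the per-limb step described above can be modelled in Python for 32-bit limbs (a sketch following the comments in this file, not GMP's actual C code; `invert_limb` and `div_step` are illustrative names):

```python
B = 1 << 32   # b = 2^N with N = 32

def invert_limb(d):
    # m = floor((b*(b-d) - 1)/d): the divl performed on edx:eax = b*(b-d)-1.
    assert d >= B // 2                    # d must be normalized
    return (B * (B - d) - 1) // d

def div_step(n2, n10, d, m):
    # One multiply-by-inverse step: divide n = n2*b + n10 by d, with n2 < d.
    n1 = n10 >> 31                        # high bit of the low limb
    nadj = (n10 + (d if n1 else 0)) % B   # nadj = n10 + (-n1 & d), mod b
    q1 = n2 + ((m * (n2 + n1) + nadj) >> 32)
    n = n2 * B + n10
    if (q1 + 1) * d > n:                  # underflow: add back d, q is q1
        return q1, n - q1 * d
    return q1 + 1, n - (q1 + 1) * d       # no addback needed

# agree with plain division across a spread of operands
for d in (0x80000000, 0x9abcdef1, 0xffffffff):
    m = invert_limb(d)
    for n in range(0, d << 32, (d << 32) // 997 + 1):
        assert div_step(n >> 32, n % B, d, m) == divmod(n, d)
```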
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The inverse takes about 50 cycles to calculate, but after that the
+dnl  multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl  even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,    -4)
+defframe(SAVE_ESI,    -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+defframe(VAR_NORM,    -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC,     -28)
+defframe(VAR_DST,     -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_DST, %edx
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
+	movl	%edi, SAVE_EDI
+	xorl	%edi, %edi		C carry
+
+	movl	-4(%esi,%ebx,4), %eax	C src high limb
+	xorl	%ecx, %ecx
+
+	C
+
+	C
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edi)		C high is carry if high<divisor
+	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
+	sbbl	$0, %ebx		C skip one division if high<divisor
+	movl	PARAM_PREINV_SHIFT, %ecx
+
+	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
+	movl	$32, %eax
+
+	movl	%edx, VAR_DST		C &dst[xsize+size]
+
+	shll	%cl, %ebp		C d normalized
+	subl	%ecx, %eax
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7		C rshift
+	movl	PARAM_PREINV_INVERSE, %eax
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+	movl	PARAM_CARRY, %edx
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	C offset 0xa1, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	$0, %edx		C initial carry (if can't skip a div)
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+	orl	%ecx, %ecx		C size
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	jz	L(no_skip_div)		C if size==0
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+	xorl	%esi, %esi
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
+
+	movl	%esi, (%edi,%ecx,4)	C dst high limb
+	sbbl	$0, %ecx		C size-1 if high<divisor
+	movl	PARAM_SRC, %esi		C reload
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	%eax, (%edi,%ecx,4)
+	decl	%ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	movl	PARAM_DST, %edi
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	movl	%edx, %eax
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	scratch (remainder)
+	C esi
+	C edi	dst
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	decl	%ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	bsrl	%ebp, %eax		C 31-l
+
+	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
+	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
+
+	movl	%edi, VAR_DST
+	movl	%ebx, VAR_DST_STOP
+
+	movl	%ecx, %ebx		C size
+	movl	$31, %ecx
+
+	movl	%edx, %edi		C carry
+	movl	$-1, %edx
+
+	C
+
+	xorl	%eax, %ecx		C l
+	incl	%eax			C 32-l
+
+	shll	%cl, %ebp		C d normalized
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7
+
+	movl	$-1, %eax
+	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	size
+	C ecx	shift
+	C edx
+	C esi	src
+	C edi	carry
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	orl	%ebx, %ebx		C size
+	movl	%eax, VAR_INVERSE
+	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
+
+	jz	L(start_zero)
+	movl	%eax, VAR_SRC
+	cmpl	$1, %ebx
+
+	movl	8(%eax), %esi		C src high limb
+	jz	L(start_one)
+
+L(start_two_or_more):
+	movl	4(%eax), %edx		C src second highest limb
+
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
+
+	cmpl	$2, %ebx
+	je	L(integer_two_left)
+	jmp	L(integer_top)
+
+
+L(start_one):
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shll	%cl, %esi		C n10 = high << l
+	movl	%eax, VAR_SRC
+	jmp	L(integer_one_left)
+
+
+L(start_zero):
+	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+	C skipped a division.
+
+	shll	%cl, %edi		C n2 = carry << l
+	movl	%edi, %eax		C return value for zero_done
+	cmpl	$0, PARAM_XSIZE
+
+	je	L(zero_done)
+	jmp	L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution.  The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C		      cycles
+C		n2+n1	1
+C		mul	6
+C		q1+1	1
+C		mul	6
+C		sub	1
+C		addback	1
+C		       ---
+C		       16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing.  Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it.  In this case
+C there's 19 cycles shown, but it executes at 17.
+
+	ALIGN(16)
+L(integer_top):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm0	scratch (src qword)
+	C mm7	rshift for normalization
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+	movl	VAR_SRC, %ecx
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movq	(%ecx), %mm0       C next limb and the one below it
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_SRC
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+	movl	VAR_DST, %ecx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psrlq	%mm7, %mm0
+
+	leal	-4(%ecx), %ecx
+
+	C
+
+	subl	%eax, %esi
+	movl	VAR_DST_STOP, %eax
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+	cmpl	%eax, %ecx
+
+	movl	%ebx, (%ecx)
+	movl	%ecx, VAR_DST
+	jne	L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case.  This makes the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+	movl	PARAM_SRC, %ecx
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movd	(%ecx), %mm0	   C src low limb
+
+	movl	VAR_DST_STOP, %ecx
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	subl	%eax, %esi
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	movl	VAR_DST_STOP, %ecx
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	C
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx           C q1 if q1+1 overflowed
+
+	mull	%ebx
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -8(%ecx)
+	subl	$8, %ecx
+
+
+
+L(integer_none):
+	cmpl	$0, PARAM_XSIZE
+	jne	L(fraction_some)
+
+	movl	%edi, %eax
+L(fraction_done):
+	movl	VAR_NORM, %ecx
+L(zero_done):
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	shrl	%cl, %eax
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+	C eax	(divisor)
+	C ebx	(q1+1 == 0)
+	C ecx
+	C edx
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+
+	movl	VAR_DST, %ecx
+	movl	VAR_DST_STOP, %edx
+	subl	$4, %ecx
+
+	psrlq	%mm7, %mm0
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+	movl	%ecx, VAR_DST
+
+	movd	%mm0, %esi		C next n10
+
+	movl	$-1, (%ecx)
+	cmpl	%ecx, %edx
+	jne	L(integer_top)
+
+	jmp	L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles.  The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C		     = m*d + b*d - m - b
+C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
+C		     = rnd(b(b-d)-1) + b*d - m - b
+C		     = rnd(b(b-d)-1 + b*d) - m - b
+C		     = rnd(b*b-1) - m - b
+C		     <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made a nonsense by out of order
+C execution.  There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
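The q1 bound claimed above is easy to spot-check numerically (a Python sketch, with the inverse recomputed the same way as earlier in this file):

```python
B = 1 << 32

# In the fraction part n10 = 0, so q1 = n2 + high(m*n2) with n2 < d; the
# derivation above says q1 <= b-2, hence q1+1 never overflows a limb.
for d in (0x80000000, 0x80000001, 0xdeadbeef, 0xfffffffe, 0xffffffff):
    m = (B * (B - d) - 1) // d            # inverse m for normalized d
    for n2 in (0, 1, d // 2, d - 2, d - 1):
        q1 = n2 + (m * n2 >> 32)
        assert q1 <= B - 2                # bound is hit at d=b-1, n2=d-1
```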
+
+	ALIGN(16)
+L(fraction_some):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	carry
+	C ebp	divisor
+
+	movl	PARAM_DST, %esi
+	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
+	movl	%edi, %eax
+
+	subl	$8, %ecx		C &dst[xsize]
+	jmp	L(fraction_entry)
+
+
+	ALIGN(16)
+L(fraction_top):
+	C eax	n2 carry, then scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst, decrementing
+	C edx	scratch
+	C esi	dst stop point
+	C edi	(will be n2)
+	C ebp	divisor
+
+	movl	%ebx, (%ecx)	C previous q
+	movl	%eax, %edi	C remainder->n2
+
+L(fraction_entry):
+	mull	VAR_INVERSE	C m*n2
+
+	movl	%ebp, %eax	C d
+	subl	$4, %ecx	C dst
+	leal	1(%edi), %ebx
+
+	C
+
+	C
+
+	C
+
+	C
+
+	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
+
+	mull	%ebx		C (q1+1)*d
+
+	C
+
+	C
+
+	C
+
+	negl	%eax		C low of n - (q1+1)*d
+
+	C
+
+	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
+	leal	(%ebp,%eax), %edx
+
+	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	C q
+	cmpl	%esi, %ecx
+
+	jne	L(fraction_top)
+
+
+	movl	%ebx, (%ecx)
+	jmp	L(fraction_done)
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/lshift.asm b/third_party/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000..b3383cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,481 @@
+dnl  AMD K7 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           4           1.51
+dnl           8           1.26
+dnl          16           1.21
+dnl          32           1.2
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right.  The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
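The shift semantics just stated can be modelled directly (a Python sketch for 32-bit limbs, not GMP code; it returns the shifted-out bits together with the result vector, since Python cannot write through a pointer):

```python
def mpn_lshift(src, shift):
    # Shift left by 1 <= shift <= 31 bits; zeros enter at the right and
    # the bits leaving the top limb are the return value.
    assert 1 <= shift <= 31
    mask = (1 << 32) - 1
    ret = src[-1] >> (32 - shift)
    dst = [0] * len(src)
    for i in range(len(src) - 1, 0, -1):
        dst[i] = ((src[i] << shift) | (src[i - 1] >> (32 - shift))) & mask
    dst[0] = (src[0] << shift) & mask
    return ret, dst

ret, dst = mpn_lshift([0x80000001, 0x00000001], 1)
assert ret == 0 and dst == [0x00000002, 0x00000003]
```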
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+	subl	$SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+	movl	PARAM_SHIFT, %ecx
+	movl	%edi, SAVE_EDI
+
+	movl	PARAM_DST, %edi
+	decl	%eax
+	jnz	L(more_than_one_limb)
+
+	movl	(%edx), %edx
+
+	shldl(	%cl, %edx, %eax)	C eax was decremented to zero
+
+	shll	%cl, %edx
+
+	movl	%edx, (%edi)
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	movd	PARAM_SHIFT, %mm6
+	movd	(%edx,%eax,4), %mm5	C src high limb
+	cmp	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+	negl	%ecx
+	movd	(%edx), %mm4		C src low limb
+
+	addl	$32, %ecx
+
+	movd	%ecx, %mm7
+
+L(simple_top):
+	C eax	loop counter, limbs
+	C ebx
+	C ecx
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm4	src low limb
+	C mm5	src high limb
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%edx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, 4(%edi,%eax,4)
+	jnz	L(simple_top)
+
+
+	psllq	%mm6, %mm5
+	psllq	%mm6, %mm4
+
+	psrlq	$32, %mm5
+	movd	%mm4, (%edi)		C dst low limb
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	(saved)
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm5	src high limb, for return value
+	C mm6	lshift
+
+	movl	%esi, SAVE_ESI
+	movl	%ebx, SAVE_EBX
+	leal	-4(%edx,%eax,4), %edx   C &src[size-2]
+
+	testb	$4, %dl
+	movq	(%edx), %mm1		C src high qword
+
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process high limb (marked xxx) separately to
+	C make it so
+	C
+	C  source    -4(edx,%eax,4)
+	C                  |
+	C  +-------+-------+-------+--
+	C  |  xxx          |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+	C
+	C  dest      -4(edi,%eax,4)
+	C                  |
+	C  +-------+-------+--
+	C  |  xxx  |       |
+	C  +-------+-------+--
+
+	psllq	%mm6, %mm1
+	subl	$4, %edx
+	movl	%eax, PARAM_SIZE	C size-1
+
+	psrlq	$32, %mm1
+	decl	%eax			C size-2 is new size-1
+
+	movd	%mm1, 4(%edi,%eax,4)
+	movq	(%edx), %mm1		C new src high qword
+L(start_src_aligned):
+
+
+	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
+	psllq	%mm6, %mm5
+
+	testl	$4, %edi
+	psrlq	$32, %mm5		C return value
+
+	jz	L(start_dst_aligned)
+
+
+	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
+	C here separately.
+	C
+	C  source       %edx
+	C  +-------+-------+--
+	C  |      mm1      |
+	C  +-------+-------+--
+	C                0mod8   4mod8
+	C
+	C  dest         %edi
+	C  +-------+-------+-------+--
+	C  |  xxx  |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+
+	movq	%mm1, %mm0
+	psllq	%mm6, %mm1
+	addl	$32, %ecx		C shift+32
+
+	psrlq	$32, %mm1
+
+	movd	%mm1, 4(%edi)
+	movq	%mm0, %mm1
+	subl	$4, %edi
+
+	movd	%ecx, %mm6		C new lshift
+L(start_dst_aligned):
+
+	decl	%eax			C size-2, two last limbs handled at end
+	movq	%mm1, %mm2		C copy of src high qword
+	negl	%ecx
+
+	andl	$-2, %eax		C round size down to even
+	addl	$64, %ecx
+
+	movl	%eax, %ebx
+	negl	%eax
+
+	andl	$UNROLL_MASK, %eax
+	decl	%ebx
+
+	shll	%eax
+
+	movd	%ecx, %mm7		C rshift = 64-lshift
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%eax,%eax,4), %esi
+')
+	shrl	$UNROLL_LOG2, %ebx	C loop counter
+
+	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+	movl	PARAM_SIZE, %eax	C for use at end
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%eax,%eax,4), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(top):
+	C eax	size (for use at end)
+	C ebx	loop counter
+	C ecx	rshift
+	C edx	src
+	C esi	computed jump
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm1	\ carry (alternating, mm2 first)
+	C mm2	/
+	C mm6	lshift
+	C mm7	rshift
+	C
+	C 10 code bytes/limb
+	C
+	C The two chunks differ in whether mm1 or mm2 hold the carry.
+	C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 - 8))
+
+Zdisp(	movq,	disp0,(%edx), %mm0)
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm2, %mm0
+Zdisp(	movq,	%mm0, disp0,(%edi))
+
+
+Zdisp(	movq,	disp1,(%edx), %mm0)
+	psllq	%mm6, %mm1
+
+	movq	%mm0, %mm2
+	psrlq	%mm7, %mm0
+
+	por	%mm1, %mm0
+Zdisp(	movq,	%mm0, disp1,(%edi))
+')
+
+	subl	$UNROLL_BYTES, %edx
+	subl	$UNROLL_BYTES, %edi
+	decl	%ebx
+
+	jns	L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+	testb	$1, %al
+	movl	SAVE_EBX, %ebx
+	psllq	%mm6, %mm2	C wanted left shifted in all cases below
+
+	movd	%mm5, %eax
+
+	movl	SAVE_ESI, %esi
+	jz	L(end_even)
+
+
+L(end_odd):
+
+	C Size odd, destination was aligned.
+	C
+	C                 source        edx+8   edx+4
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C dest                            edi
+	C --+---------------+---------------+-------+
+	C   |   written     |               |       |
+	C --+---------------+---------------+-------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size odd, destination was unaligned.
+	C
+	C                 source        edx+8   edx+4
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C         dest                            edi
+	C         --+---------------+---------------+
+	C           |   written     |               |
+	C         --+---------------+---------------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at (%edi), and in the aligned case
+	C there's an extra limb of dst to be formed from that extra src limb
+	C left shifted.
+
+	movd	disp(4) (%edx), %mm0
+	testb	$32, %cl
+
+	movq	%mm0, %mm1
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+	psllq	%mm6, %mm1
+
+	por	%mm2, %mm0
+
+	movq	%mm0, disp(0) (%edi)
+	jz	L(end_odd_unaligned)
+	movd	%mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+L(end_even):
+
+	C Size even, destination was aligned.
+	C
+	C                 source        edx+8
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C dest                            edi
+	C --+---------------+---------------+
+	C   |   written     |               |
+	C --+---------------+---------------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size even, destination was unaligned.
+	C
+	C               source          edx+8
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C         dest                  edi+4
+	C         --+---------------+-------+
+	C           |    written    |       |
+	C         --+---------------+-------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C The movq for the aligned case overwrites the movd for the
+	C unaligned case.
+
+	movq	%mm2, %mm0
+	psrlq	$32, %mm2
+
+	testb	$32, %cl
+	movd	%mm2, disp(4) (%edi)
+
+	jz	L(end_even_unaligned)
+	movq	%mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+EPILOGUE()
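The unrolled MMX code above is the tail of an mpn_lshift implementation (per the `new lshift` and `mm6 lshift` register comments). Its contract is compact: shift a little-endian limb array left by 1..31 bits and return the bits shifted out at the top, placed in the low end of a limb. A hedged Python reference model with 32-bit limbs (the function name and list representation are illustrative, not GMP's API):

```python
def mpn_lshift(src, shift):
    """Reference model of mpn_lshift for 32-bit limbs.

    Shifts the little-endian limb list `src` left by `shift` (1..31) bits
    and returns (dst, retval), where retval holds the bits shifted out at
    the left, placed in the low end of a limb as GMP specifies.
    """
    assert 1 <= shift <= 31
    mask = (1 << 32) - 1
    retval = src[-1] >> (32 - shift)          # bits shifted out at the top
    dst = []
    for i in range(len(src)):
        hi = (src[i] << shift) & mask         # this limb, shifted
        lo = src[i - 1] >> (32 - shift) if i > 0 else 0   # carry from below
        dst.append(hi | lo)
    return dst, retval
```

The identity `retval * 2**(32*n) + value(dst) == value(src) << shift` is what the aligned/unaligned end-cases above arrange limb by limb.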
diff --git a/third_party/gmp/mpn/x86/k7/mmx/popham.asm b/third_party/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000..95965b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,213 @@
+dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl  distance.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 generic			6.5		7
+C P3 model 9  (Banias)          5.7		6.1
+C P3 model 13 (Dothan)		5.75		6
+C K7				5		6
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code.  The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC2,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+
+	RODATA
+	ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+')
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %eax
+	movl	$0x33333333, %edx
+
+	movd	%eax, %mm7
+	movd	%edx, %mm6
+
+	movl	$0x0F0F0F0F, %eax
+
+	punpckldq %mm7, %mm7
+	punpckldq %mm6, %mm6
+
+	movd	%eax, %mm5
+	movd	%edx, %mm4
+
+	punpckldq %mm5, %mm5
+
+',`
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+	pxor	%mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+	movl	PARAM_SRC, %eax
+HAM(`	movl	PARAM_SRC2, %edx')
+
+	pxor	%mm2, %mm2	C total
+
+	shrl	%ecx
+	jnc	L(top)
+
+	movd	(%eax,%ecx,8), %mm1
+
+HAM(`	movd	(%edx,%ecx,8), %mm0
+	pxor	%mm0, %mm1
+')
+	orl	%ecx, %ecx
+	jmp	L(loaded)
+
+
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, qwords, decrementing
+	C edx	[hamdist] src2
+	C
+	C mm0	(scratch)
+	C mm1	(scratch)
+	C mm2	total (low dword)
+	C mm3
+	C mm4	\
+	C mm5	| special constants
+	C mm6	|
+	C mm7	/
+
+	movq	-8(%eax,%ecx,8), %mm1
+
+HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
+	decl	%ecx
+
+L(loaded):
+	movq	%mm1, %mm0
+	pand	REG_AAAAAAAAAAAAAAAA, %mm1
+
+	psrlq	$1, %mm1
+
+	psubd	%mm1, %mm0	C bit pairs
+
+
+	movq	%mm0, %mm1
+	psrlq	$2, %mm0
+
+	pand	REG_3333333333333333, %mm0
+	pand	REG_3333333333333333, %mm1
+
+	paddd	%mm1, %mm0	C nibbles
+
+
+	movq	%mm0, %mm1
+	psrlq	$4, %mm0
+
+	pand	REG_0F0F0F0F0F0F0F0F, %mm0
+	pand	REG_0F0F0F0F0F0F0F0F, %mm1
+
+	paddd	%mm1, %mm0	C bytes
+
+
+	psadbw(	%mm4, %mm0)
+
+	paddd	%mm0, %mm2	C add to total
+	jnz	L(top)
+
+
+	movd	%mm2, %eax
+	emms
+	ret
+
+EPILOGUE()
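The loop body above is the classic SWAR reduction from the cited Athlon optimization guide: subtract shifted bit pairs, mask into nibbles, mask into bytes, then psadbw against zero to sum the bytes. A minimal Python sketch of the same per-qword reduction (a reference model, not GMP code; the 64-bit masks mirror the mm5-mm7 constants):

```python
def popcount64(x):
    """SWAR population count of one 64-bit qword, step for step as the
    MMX loop does it: bit pairs -> nibbles -> bytes -> byte sum."""
    x -= (x & 0xAAAAAAAAAAAAAAAA) >> 1                              # bit pairs
    x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)  # nibbles
    x = (x & 0x0F0F0F0F0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F0F0F0F0F)  # bytes
    return sum(x.to_bytes(8, "little"))        # psadbw: horizontal byte sum

def hamdist64(a, b):
    """Hamming distance, as in the HAM variant: popcount of the xor."""
    return popcount64(a ^ b)
```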
diff --git a/third_party/gmp/mpn/x86/k7/mmx/rshift.asm b/third_party/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000..345d23a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,480 @@
+dnl  AMD K7 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           4           1.51
+dnl           8           1.26
+dnl          16           1.21
+dnl          32           1.2
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+	subl	$SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+	movl	PARAM_SHIFT, %ecx
+	movl	%edi, SAVE_EDI
+
+	movl	PARAM_DST, %edi
+	decl	%eax
+	jnz	L(more_than_one_limb)
+
+	movl	(%edx), %edx		C src limb
+
+	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%edi)		C dst limb
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	movd	PARAM_SHIFT, %mm6	C rshift
+	movd	(%edx), %mm5		C src low limb
+	cmp	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+	leal	(%edx,%eax,4), %edx	C &src[size-1]
+	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]
+
+	movd	(%edx), %mm4		C src high limb
+	negl	%eax
+
+
+L(simple_top):
+	C eax	loop counter, limbs, negative
+	C ebx
+	C ecx	shift
+	C edx	carry
+	C edx	&src[size-1]
+	C edi	&dst[size-2]
+	C ebp
+	C
+	C mm0	scratch
+	C mm4	src high limb
+	C mm5	src low limb
+	C mm6	shift
+
+	movq	(%edx,%eax,4), %mm0
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+	movd	%mm0, (%edi,%eax,4)
+	jnz	L(simple_top)
+
+
+	psllq	$32, %mm5
+	psrlq	%mm6, %mm4
+
+	psrlq	%mm6, %mm5
+	movd	%mm4, 4(%edi)		C dst high limb
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm5	src low limb
+	C mm6	rshift
+
+	testb	$4, %dl
+	movl	%esi, SAVE_ESI
+	movl	%ebx, SAVE_EBX
+
+	psllq	$32, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process low limb separately (marked xxx) and
+	C step src and dst by one limb, making src aligned.
+	C
+	C source                  edx
+	C --+-------+-------+-------+
+	C           |          xxx  |
+	C --+-------+-------+-------+
+	C         4mod8   0mod8   4mod8
+	C
+	C         dest            edi
+	C         --+-------+-------+
+	C           |       |  xxx  |
+	C         --+-------+-------+
+
+	movq	(%edx), %mm0		C src low two limbs
+	addl	$4, %edx
+	movl	%eax, PARAM_SIZE	C size-1
+
+	addl	$4, %edi
+	decl	%eax			C size-2 is new size-1
+
+	psrlq	%mm6, %mm0
+	movl	%edi, PARAM_DST		C new dst
+
+	movd	%mm0, -4(%edi)
+L(start_src_aligned):
+
+
+	movq	(%edx), %mm1		C src low two limbs
+	decl	%eax			C size-2, two last limbs handled at end
+	testl	$4, %edi
+
+	psrlq	%mm6, %mm5
+	jz	L(start_dst_aligned)
+
+
+	C dst isn't aligned, add 4 to make it so, and pretend the shift is
+	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
+	C
+	C          source          edx
+	C          --+-------+-------+
+	C            |      mm1      |
+	C          --+-------+-------+
+	C                  4mod8   0mod8
+	C
+	C  dest                    edi
+	C  --+-------+-------+-------+
+	C                    |  xxx  |
+	C  --+-------+-------+-------+
+	C          4mod8   0mod8   4mod8
+
+	movq	%mm1, %mm0
+	psrlq	%mm6, %mm1
+	addl	$32, %ecx		C shift+32
+
+	movd	%mm1, (%edi)
+	movq	%mm0, %mm1
+	addl	$4, %edi		C new dst
+
+	movd	%ecx, %mm6
+L(start_dst_aligned):
+
+
+	movq	%mm1, %mm2		C copy of src low two limbs
+	negl	%ecx
+	andl	$-2, %eax		C round size down to even
+
+	movl	%eax, %ebx
+	negl	%eax
+	addl	$64, %ecx
+
+	andl	$UNROLL_MASK, %eax
+	decl	%ebx
+
+	shll	%eax
+
+	movd	%ecx, %mm7		C lshift = 64-rshift
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%eax,%eax,4), %esi
+	negl	%eax
+')
+	shrl	$UNROLL_LOG2, %ebx	C loop counter
+
+	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+	movl	PARAM_SIZE, %eax	C for use at end
+
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%eax,%eax,4), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+	negl	%eax
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(64)
+L(top):
+	C eax	size, for use at end
+	C ebx	loop counter
+	C ecx	lshift
+	C edx	src
+	C esi	was computed jump
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm1	\ carry (alternating)
+	C mm2	/
+	C mm6	rshift
+	C mm7	lshift
+	C
+	C 10 code bytes/limb
+	C
+	C The two chunks differ in whether mm1 or mm2 hold the carry.
+	C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 8))
+
+Zdisp(	movq,	disp0,(%edx), %mm0)
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm2, %mm0
+Zdisp(	movq,	%mm0, disp0,(%edi))
+
+
+Zdisp(	movq,	disp1,(%edx), %mm0)
+	psrlq	%mm6, %mm1
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm1, %mm0
+Zdisp(	movq,	%mm0, disp1,(%edi))
+')
+
+	addl	$UNROLL_BYTES, %edx
+	addl	$UNROLL_BYTES, %edi
+	decl	%ebx
+
+	jns	L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+	testb	$1, %al
+	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
+	movl	SAVE_ESI, %esi
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EBX, %ebx
+	jz	L(end_even)
+
+
+	C Size odd, destination was aligned.
+	C
+	C source
+	C       edx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                  edi
+	C +-------+---------------+---------------+--
+	C |       |               |    written    |
+	C +-------+---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size odd, destination was unaligned.
+	C
+	C source
+	C       edx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest          edi
+	C +---------------+---------------+--
+	C |               |    written    |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword to store, and in the aligned case there's
+	C a further extra limb of dst to be formed.
+
+
+	movd	disp0(%edx), %mm0
+	movq	%mm0, %mm1
+
+	psllq	%mm7, %mm0
+	testb	$32, %cl
+
+	por	%mm2, %mm0
+	psrlq	%mm6, %mm1
+
+	movq	%mm0, disp0(%edi)
+	jz	L(finish_odd_unaligned)
+
+	movd	%mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+L(end_even):
+
+	C Size even, destination was aligned.
+	C
+	C source
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest          edi
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size even, destination was unaligned.
+	C
+	C source
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest  edi
+	C +-------+---------------+--
+	C |       |      mm3      |
+	C +-------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case is the same data as the movq for
+	C the aligned case, it's just a choice between whether one or two
+	C limbs should be written.
+
+
+	testb	$32, %cl
+	movd	%mm2, disp0(%edi)
+
+	jz	L(end_even_unaligned)
+
+	movq	%mm2, disp0(%edi)
+L(end_even_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+EPILOGUE()
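All the alignment cases above serve mpn_rshift's simple contract: shift right by 1..31 bits, zero-fill at the left, and return the shifted-out bits in the most significant end of a limb. A hedged Python reference model with 32-bit limbs (illustrative names, not GMP's API):

```python
def mpn_rshift(src, shift):
    """Reference model of mpn_rshift for 32-bit limbs.

    Shifts the little-endian limb list `src` right by `shift` (1..31) bits,
    zero-filling at the left, and returns (dst, retval) with the shifted-out
    bits in the high end of retval, as GMP specifies.
    """
    assert 1 <= shift <= 31
    mask = (1 << 32) - 1
    retval = (src[0] << (32 - shift)) & mask  # bits shifted out at the right
    dst = []
    for i in range(len(src)):
        lo = src[i] >> shift                  # this limb, shifted
        hi = (src[i + 1] << (32 - shift)) & mask if i + 1 < len(src) else 0
        dst.append(lo | hi)
    return dst, retval
```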
diff --git a/third_party/gmp/mpn/x86/k7/mod_1_1.asm b/third_party/gmp/mpn/x86/k7/mod_1_1.asm
new file mode 100644
index 0000000..1bbe6f9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_1_1.asm
@@ -0,0 +1,221 @@
+dnl  x86-32 mpn_mod_1_1p, requiring cmov.
+
+dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			 7
+C AMD K8			 ?
+
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx')  C Also shift count
+
+C Stack frame
+C	pre	36(%esp)
+C	b	32(%esp)
+C	n	28(%esp)
+C	ap	24(%esp)
+C	return	20(%esp)
+C	%ebp	16(%esp)
+C	%edi	12(%esp)
+C	%esi	8(%esp)
+C	%ebx	4(%esp)
+C	B2mod	(%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	32(%esp), %ebp		C pre[]
+
+	mov	12(%ebp), %eax		C B2modb
+	push	%eax			C Put it on stack
+
+	mov	n, %edx
+	mov	24(%esp), ap
+
+	lea	(ap, %edx, 4), ap
+	mov	-4(ap), %eax
+	cmp	$3, %edx
+	jnc	L(first)
+	mov	-8(ap), r0
+	jmp	L(reduce_two)
+
+L(first):
+	C First iteration, no r2
+	mull	B2modb
+	mov	-12(ap), r0
+	add	%eax, r0
+	mov	-8(ap), %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+	subl	$3, n
+	lea	-16(ap), ap
+	jz	L(reduce_three)
+
+	mov	B2modb, B2mb
+	sub	b, B2mb
+	lea	(B2mb, r0), t0
+	jmp	L(mid)
+
+	ALIGN(16)
+L(top): C Loopmixed to 7 c/l on k7
+	add	%eax, r0
+	lea	(B2mb, r0), t0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+L(mid):	mull	B2modb
+	and	B2modb, r2
+	add	r0, r2
+	decl	n
+	mov	(ap), r0
+	cmovc(	t0, r2)
+	lea	-4(ap), ap
+	jnz	L(top)
+
+	add	%eax, r0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+
+L(reduce_three):
+	C Eliminate r2
+	and	b, r2
+	sub	r2, %eax
+
+L(reduce_two):
+	mov	pre, %ebp
+	movb	4(%ebp), %cl
+	test	%cl, %cl
+	jz	L(normalized)
+
+	C Unnormalized, use B1modb to reduce to size < B b
+	mull	8(%ebp)
+	xor	t0, t0
+	add	%eax, r0
+	adc	%edx, t0
+	mov	t0, %eax
+
+	C Left-shift to normalize
+	shld	%cl, r0, %eax C Always use shld?
+
+	shl	%cl, r0
+	jmp	L(udiv)
+
+L(normalized):
+	mov	%eax, t0
+	sub	b, t0
+	cmovnc(	t0, %eax)
+
+L(udiv):
+	lea	1(%eax), t0
+	mull	(%ebp)
+	mov	b, %ebx		C Needed in register for lea
+	add	r0, %eax
+	adc	t0, %edx
+	imul	%ebx, %edx
+	sub	%edx, r0
+	cmp	r0, %eax
+	lea	(%ebx, r0), %eax
+	cmovnc(	r0, %eax)
+	cmp	%ebx, %eax
+	jnc	L(fix)
+L(ok):	shr	%cl, %eax
+
+	add	$4, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+
+	ret
+L(fix):	sub	%ebx, %eax
+	jmp	L(ok)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp			C On K7, invert_limb would be a few cycles faster.
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	neg	%ebp
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%ebp, %edx
+	shr	%cl, %edx
+	imul	%ebp, %eax
+	mov	%edx, 8(%esi)		C store B1modb
+	mov	%eax, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
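The main loop keeps a two-limb residue and folds the high limb through B2modb = B^2 mod b, which mpn_mod_1_1p_cps precomputes above alongside bi, cnt, and B1modb. Ignoring the register and carry juggling, the arithmetic can be sketched in Python (32-bit limbs; a congruence-level model, not the exact code path):

```python
def mpn_mod_1_1p(ap, b):
    """Congruence-level model of mpn_mod_1_1p for 32-bit limbs.

    Keeps a two-limb residue r1*B + r0 and folds in the next source limb
    using r1*B**2 + r0*B + a == r1*B2modb + r0*B + a (mod b).  Assumes at
    least two limbs; the one-limb case is handled separately in the asm.
    """
    assert len(ap) >= 2
    B = 1 << 32
    b2modb = (B * B) % b                 # pre[3], computed by the cps routine
    r1, r0 = ap[-1], ap[-2]              # start from the top two limbs
    for a in reversed(ap[:-2]):
        r1, r0 = divmod(r1 * b2modb + r0 * B + a, B)
    return (r1 * B + r0) % b             # final reduction (shift + udiv in asm)
```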
diff --git a/third_party/gmp/mpn/x86/k7/mod_1_4.asm b/third_party/gmp/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000..bb7597e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 6
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	15.5
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			 4.75
+C AMD K8			 ?
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$28, %esp
+	mov	60(%esp), %edi		C cps[]
+	mov	8(%edi), %eax
+	mov	12(%edi), %edx
+	mov	16(%edi), %ecx
+	mov	20(%edi), %esi
+	mov	24(%edi), %edi
+	mov	%eax, 4(%esp)
+	mov	%edx, 8(%esp)
+	mov	%ecx, 12(%esp)
+	mov	%esi, 16(%esp)
+	mov	%edi, 20(%esp)
+	mov	52(%esp), %eax		C n
+	xor	%edi, %edi
+	mov	48(%esp), %esi		C up
+	lea	-12(%esi,%eax,4), %esi
+	and	$3, %eax
+	je	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	mov	4(%esi), %eax
+	mull	4(%esp)
+	mov	(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	8(%esp)
+	lea	-12(%esi), %esi
+	jmp	L(m0)
+
+L(b0):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	lea	-16(%esi), %esi
+	jmp	L(m0)
+
+L(b1):	mov	8(%esi), %ebp
+	lea	-4(%esi), %esi
+	jmp	L(m1)
+
+L(b2):	mov	8(%esi), %edi
+	mov	4(%esi), %ebp
+	lea	-8(%esi), %esi
+	jmp	L(m1)
+
+	ALIGN(16)
+L(top):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebx
+	xor	%ecx, %ecx
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	lea	-16(%esi), %esi
+	mov	16(%esp), %eax
+	mul	%ebp
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	20(%esp), %eax
+	mul	%edi
+	mov	%ebx, %ebp
+	mov	%ecx, %edi
+L(m0):	add	%eax, %ebp
+	adc	%edx, %edi
+L(m1):	subl	$4, 52(%esp)
+	ja	L(top)
+
+L(end):	mov	4(%esp), %eax
+	mul	%edi
+	mov	60(%esp), %edi
+	add	%eax, %ebp
+	adc	$0, %edx
+	mov	4(%edi), %ecx
+	mov	%edx, %esi
+	mov	%ebp, %eax
+	sal	%cl, %esi
+	mov	%ecx, %ebx
+	neg	%ecx
+	shr	%cl, %eax
+	or	%esi, %eax
+	lea	1(%eax), %esi
+	mull	(%edi)
+	mov	%ebx, %ecx
+	mov	%eax, %ebx
+	mov	%ebp, %eax
+	mov	56(%esp), %ebp
+	sal	%cl, %eax
+	add	%eax, %ebx
+	adc	%esi, %edx
+	imul	%ebp, %edx
+	sub	%edx, %eax
+	lea	(%eax,%ebp), %edx
+	cmp	%eax, %ebx
+	cmovc(	%edx, %eax)
+	mov	%eax, %edx
+	sub	%ebp, %eax
+	cmovc(	%edx, %eax)
+	add	$28, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
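mpn_mod_1s_4p processes four limbs per iteration, multiplying them by the precomputed B1modb..B5modb values that the cps routine above stores, and defers the division to a single final step. The underlying arithmetic reduces to a Horner evaluation in base B^4; a Python sketch (illustrative names; the real code keeps the precomputed values shifted by cnt and defers the modular reductions):

```python
def mpn_mod_1s_4p(ap, b):
    """Arithmetic model of mpn_mod_1s_4p for 32-bit limbs: fold four
    limbs at a time through B**4 mod b, Horner-style from the top."""
    B = 1 << 32
    b4 = pow(B, 4, b)                    # plays the role of the B^4 residue
    limbs = list(ap)
    while len(limbs) % 4:                # pad the high end with zero limbs
        limbs.append(0)
    r = 0
    for j in reversed(range(0, len(limbs), 4)):
        chunk = (limbs[j] + (limbs[j + 1] << 32)
                 + (limbs[j + 2] << 64) + (limbs[j + 3] << 96))
        r = (r * b4 + chunk) % b         # the real code defers this reduction
    return r
```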
diff --git a/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm b/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm
new file mode 100644
index 0000000..ee3ad04
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -0,0 +1,188 @@
+dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C Athlon:     1
+C Hammer:     1
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C The loop form below and the 64 byte code alignment seem necessary for the
+C claimed speed.  This is a bit strange, since normally k7 isn't very
+C sensitive to such things.  Perhaps there has to be 6 instructions in the
+C first 16 bytes for the BTB entry or something.
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EDI, `PARAM_SIZE')
+
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jb	L(one)
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+	shrl	$24, %eax		C src[0] low
+
+	andl	$0xFFFFFF, %edx		C src[0] high
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	andl	$0xFFFF, %ecx
+	shrl	$16, %edx		C src[1] high
+	addl	%edx, %eax
+
+	shll	$8, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+
+	pushl	%ebx	FRAME_pushl()
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+
+	movl	%edi, SAVE_EDI
+	pushl	%esi	FRAME_pushl()
+	xorl	%esi, %esi		C and clear carry flag
+
+
+	C code offset 0x40 at this point
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi
+
+	leal	24(%edx), %edx
+	leal	-2(%ecx), %ecx
+	adcl	-24(%edx), %eax
+	adcl	-20(%edx), %ebx
+	adcl	-16(%edx), %esi
+
+	decl	%ecx
+	jng	L(done_loop)
+
+	leal	-2(%ecx), %ecx
+	adcl	-12(%edx), %eax
+	adcl	-8(%edx), %ebx
+	adcl	-4(%edx), %esi
+
+	decl	%ecx
+	jg	L(top)
+
+
+	leal	12(%edx), %edx
+
+
+L(done_loop):
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	incl	%ecx
+	movl	$0xFFFFFFFF, %edi
+	js	L(combine)
+
+	adcl	-12(%edx), %eax
+	decl	%ecx
+	movl	$0xFFFFFF00, %edi
+	js	L(combine)
+
+	adcl	-8(%edx), %ebx
+	movl	$0xFFFF0000, %edi
+
+
+L(combine):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	mask
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%eax, %edx		C 0mod3
+	shrl	$24, %eax		C 0mod3 high
+
+	andl	%edi, %ecx		C carry masked
+	andl	$0x00FFFFFF, %edx	C 0mod3 low
+	movl	%ebx, %edi		C 1mod3
+
+	subl	%ecx, %eax		C apply carry
+	shrl	$16, %ebx		C 1mod3 high
+	andl	$0xFFFF, %edi
+
+	addl	%edx, %eax		C apply 0mod3 low
+	movl	%esi, %edx		C 2mod3
+	shll	$8, %edi		C 1mod3 low
+
+	addl	%ebx, %eax		C apply 1mod3 high
+	shrl	$8, %esi		C 2mod3 high
+	movzbl	%dl, %edx		C 2mod3 low
+
+	addl	%edi, %eax		C apply 1mod3 low
+	shll	$16, %edx		C 2mod3 low
+
+	addl	%esi, %eax		C apply 2mod3 high
+	popl	%esi	FRAME_popl()
+
+	movl	SAVE_EDI, %edi
+	addl	%edx, %eax		C apply 2mod3 low
+	popl	%ebx	FRAME_popl()
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mode1o.asm b/third_party/gmp/mpn/x86/k7/mode1o.asm
new file mode 100644
index 0000000..2394033
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mode1o.asm
@@ -0,0 +1,181 @@
+dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      7.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C With the loop running at just 11 cycles it doesn't seem worth bothering to
+C check for high<divisor to save one step.
+C
+C Using a divl for size==1 measures slower than the modexact method, which
+C is not too surprising since for the latter it's only about 24 cycles to
+C calculate the modular inverse.
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+deflit(STACK_SPACE, 16)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_DIVISOR, %esi
+
+	movl	%edi, SAVE_EDI
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+	movzbl	(%eax,%edi), %edi		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
+')
+
+	xorl	%edx, %edx		C initial extra carry
+	leal	(%edi,%edi), %eax	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SRC, %ebx
+
+	imull	%esi, %edi		C inv*inv*d
+
+	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %edi	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	%esi, %eax		C inv*inv*d
+
+	leal	(%ebx,%ebp,4), %ebx	C src end
+	negl	%ebp			C -size
+
+	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	%esi, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax')
+
+
+C The dependent chain here is
+C
+C                            cycles
+C	subl	%edx, %eax	1
+C	imull	%edi, %eax	4
+C	mull	%esi		6  (high limb)
+C			      ----
+C       total		       11
+C
+C Out of order execution hides the load latency for the source data, so no
+C special scheduling is required.
+
+L(top):
+	C eax	src limb
+	C ebx	src end ptr
+	C ecx	next carry bit, 0 or 1 (or initial carry param)
+	C edx	carry limb, high of last product
+	C esi	divisor
+	C edi	inverse
+	C ebp	counter, limbs, negative
+
+	movl	(%ebx,%ebp,4), %eax
+
+	subl	%ecx, %eax		C apply carry bit
+	movl	$0, %ecx
+
+	setc	%cl			C new carry bit
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ecx
+
+	imull	%edi, %eax
+
+	mull	%esi
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	leal	(%ecx,%edx), %eax
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/mul_1.asm b/third_party/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000..755cd2e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,237 @@
+dnl  AMD K7 mpn_mul_1.
+
+dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7			 3.25
+C AMD K8
+
+C TODO
+C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
+C    but we might be able to do even better.
+C  * The feed-in code for mul_1c is crude.
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %ebp
+	mov	32(%esp), %ecx
+	mov	%ebp, %ebx
+	shr	$2, %ebp
+	mov	%ebp, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(c0)
+	cmp	$2, %ebx
+	mov	36(%esp), %ebx
+	jz	L(c2)
+	jg	L(c3)
+
+L(c1):	lea	-4(%edi), %edi
+	mul	%ecx
+	test	%ebp, %ebp
+	jnz	1f
+	add	%ebx, %eax
+	mov	%eax, 4(%edi)
+	mov	%edx, %eax
+	adc	%ebp, %eax
+	jmp	L(rt)
+1:	add	%eax, %ebx
+	mov	$0, %ebp
+	adc	%edx, %ebp
+	mov	4(%esi), %eax
+	jmp	L(1)
+
+L(c2):	lea	4(%esi), %esi
+	mul	%ecx
+	test	%ebp, %ebp
+	mov	%ebx, %ebp
+	jnz	2f
+	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	(%esi), %eax
+	jmp	L(cj2)
+2:	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	(%esi), %eax
+	jmp	L(2)
+
+L(c3):	lea	8(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	add	%eax, %ebx
+	mov	$0, %ebp
+	adc	%edx, %ebp
+	mov	-4(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(c0):	mov	36(%esp), %ebx
+	lea	-4(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%ebx, %ebp
+	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	8(%esi), %eax
+	jmp	L(0)
+
+EPILOGUE()
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %ebp
+	mov	32(%esp), %ecx
+	mov	%ebp, %ebx
+	shr	$2, %ebp
+	mov	%ebp, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	lea	-4(%edi), %edi
+	mul	%ecx
+	test	%ebp, %ebp
+	jnz	L(gt1)
+	mov	%eax, 4(%edi)
+	mov	%edx, %eax
+	jmp	L(rt)
+L(gt1):	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	4(%esi), %eax
+	jmp	L(1)
+
+L(b2):	lea	4(%esi), %esi
+	mul	%ecx
+	test	%ebp, %ebp
+	mov	%eax, %ebp
+	mov	%edx, %ebx
+	mov	(%esi), %eax
+	jnz	L(2)
+	jmp	L(cj2)
+
+L(b3):	lea	8(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	-4(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(b0):	lea	-4(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebp
+	mov	%edx, %ebx
+	mov	8(%esi), %eax
+	jmp	L(0)
+
+	ALIGN(16)
+L(top):	mov	$0, %ebx
+	adc	%edx, %ebx
+L(2):	mul	%ecx
+	add	%eax, %ebx
+	mov	%ebp, 0(%edi)
+	mov	4(%esi), %eax
+	mov	$0, %ebp
+	adc	%edx, %ebp
+L(1):	mul	%ecx
+	add	%eax, %ebp
+	mov	8(%esi), %eax
+	mov	%ebx, 4(%edi)
+	mov	$0, %ebx
+	adc	%edx, %ebx
+L(0):	mov	%ebp, 8(%edi)
+	mul	%ecx
+	add	%eax, %ebx
+	mov	12(%esi), %eax
+	lea	16(%esi), %esi
+	mov	$0, %ebp
+	adc	%edx, %ebp
+L(3):	mov	%ebx, 12(%edi)
+	mul	%ecx
+	lea	16(%edi), %edi
+	add	%eax, %ebp
+	decl	28(%esp)
+	mov	0(%esi), %eax
+	jnz	L(top)
+
+L(end):	mov	$0, %ebx
+	adc	%edx, %ebx
+L(cj2):	mul	%ecx
+	add	%eax, %ebx
+	mov	%ebp, (%edi)
+L(cj1):	mov	%ebx, 4(%edi)
+	adc	$0, %edx
+	mov	%edx, %eax
+
+L(rt):	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/mul_basecase.asm b/third_party/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000..4dfb500
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,602 @@
+dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+C     limbs/loop unrolling).
+
+
+
+dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl           8           4.67
+dnl          16           4.59
+dnl          32           4.42
+dnl  Maximum possible with the current code is 32.
+dnl
+dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
+dnl  done with a straight run through a block of code, no inner loop.  Using
+dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.  The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+	movl	(%eax), %eax	C yp low limb
+
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	mull	(%edx)
+
+	movl	PARAM_WP, %ecx
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+	decl	PARAM_YSIZE
+	pushl	%ebx		defframe_pushl(`SAVE_EBX')
+	movl	%eax, %ecx	C yp low limb
+
+	movl	PARAM_WP, %ebx
+	pushl	%esi		defframe_pushl(`SAVE_ESI')
+	movl	%edx, %esi	C xp
+
+	movl	(%edx), %eax	C xp low limb
+	jnz	L(two_by_two)
+
+
+	C two limbs by one limb
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+	movl	%edx, %esi	C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+
+	movl	%esi, 4(%ebx)
+	movl	SAVE_ESI, %esi
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp low limb
+	C ebx	wp
+	C ecx	yp low limb
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+
+dnl  FRAME carries on from previous
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	push	%edi		defframe_pushl(`SAVE_EDI')
+	movl	%edx, %edi	C carry, for wp[1]
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+	movl	4(%ecx), %ecx	C yp[1]
+	movl	%edi, 4(%ebx)
+
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+
+	adcl	$0, %edx
+	movl	(%esi), %eax	C xp[0]
+
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	adcl	%edx, %edi
+	movl	%edi, 8(%ebx)
+
+	adcl	$0, %esi
+	movl	SAVE_EDI, %edi
+	movl	%esi, 12(%ebx)
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline.  Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times).  A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+
+dnl  FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+	subl	$16, %esp
+deflit(`FRAME',16)
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_WP, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+
+	movl	%esi, SAVE_ESI
+	xorl	%ebx, %ebx
+	leal	(%edx,%ecx,4), %esi	C xp end
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	PARAM_XSIZE, %ecx
+
+	movl	%ebx, (%edi)		C final carry
+	decl	%edx
+
+	jnz	L(ysize_more_than_one)
+
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_EBP, %ebp
+	movl	SAVE_ESI, %esi
+	addl	$FRAME, %esp
+
+	ret
+
+
+L(ysize_more_than_one):
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+	C simple addmul looping
+	C
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	leal	4(%eax,%edx,4), %ebp	C yp end
+	negl	%ecx
+	negl	%edx
+
+	movl	(%esi,%ecx,4), %eax	C xp low limb
+	movl	%edx, PARAM_YSIZE	C -(ysize-1)
+	incl	%ecx
+
+	xorl	%ebx, %ebx		C initial carry
+	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
+	movl	%ebp, PARAM_YP
+
+	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
+	jmp	L(simple_outer_entry)
+
+
+	C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	PARAM_YP, %edx
+	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
+	xorl	%ebx, %ebx		C carry
+
+	movl	%ebp, PARAM_YSIZE
+	addl	$4, %edi		C next position in wp
+
+	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
+	movl	-4(%esi,%ecx,4), %eax	C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+	C eax	xp limb
+	C ebx	carry limb
+	C ecx	loop counter (negative)
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%eax, %ebx
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %edx
+
+	incl	%ecx
+	movl	%edx, %ebx
+	jnz	L(simple_inner)
+
+
+	mull	%ebp
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	addl	%ebx, (%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, 4(%edi)
+	jnz	L(simple_outer_top)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
+C to give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER,  -20)
+defframe(VAR_ADJUST,   -24)
+defframe(VAR_JMP,      -28)
+defframe(VAR_XP_LOW,   -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	movl	PARAM_XP, %esi
+	movl	4(%eax), %ebp		C multiplier (yp second limb)
+	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
+
+	movl	PARAM_WP, %edi
+	movl	%eax, PARAM_YP
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
+	decl	%ecx				C xsize-1
+
+	movl	(%esi), %eax		C xp low limb
+	andl	$-UNROLL_MASK-1, %ebx
+	negl	%ecx
+
+	subl	$VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+	negl	%ebx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ebx, VAR_ADJUST
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+	sarl	$UNROLL_LOG2, %ebx
+
+	C 17 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+	negl	%edx
+
+	movl	%eax, VAR_XP_LOW
+	movl	%ecx, VAR_JMP
+	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
+	leal	4(%esi,%edx,4), %esi	C  and start at second limb
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%edx,1), %ecx
+	addl	$L(unroll_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	VAR_ADJUST, %ebx
+	movl	PARAM_YP, %edx
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
+
+	leal	4(%edi,%ebx,4), %edi
+	leal	(%esi,%ebx,4), %esi
+	sarl	$UNROLL_LOG2, %ebx
+
+	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
+	movl	VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	testb	$1, %cl		C and clear carry bit
+	movl	%ebx, VAR_COUNTER
+	movl	$0, %ebx
+
+	movl	$0, %ecx
+	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
+	cmovnz(	%eax, %ebx)
+
+	C Extra fetch of VAR_JMP is bad, but registers are tight
+	jmp	*VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_top):
+	C eax	xp limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	xp+8
+	C edi	wp
+	C ebp	yp multiplier limb
+	C
+	C VAR_COUNTER  loop counter, negative
+	C
+	C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	adcl	%edx, %ebx
+
+	mull	%ebp
+
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	movl	$0, %ecx
+
+	adcl	%eax, %ebx
+
+
+	movl	disp1(%esi), %eax
+	adcl	%edx, %ecx
+
+	mull	%ebp
+
+	addl	%ebx, disp1(%edi)
+	movl	$0, %ebx
+
+	adcl	%eax, %ecx
+')
+
+
+	incl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+	leal	UNROLL_BYTES(%edi), %edi
+
+	jnz	L(unroll_top)
+
+
+	C eax
+	C ebx	zero
+	C ecx	low
+	C edx	high
+	C esi
+	C edi	wp, pointing at second last limb
+	C ebp
+	C
+	C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+	movl	PARAM_YSIZE, %ebp
+	adcl	$0, %edx
+	addl	%ecx, disp0(%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, disp1(%edi)
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/sqr_basecase.asm b/third_party/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000..7b6a97e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,635 @@
+dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+C     (measured on the speed difference between 25 and 50 limbs, which is
+C     roughly the Karatsuba recursing range).
+
+
+dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl  some comments.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+	cmpl	$2, %ecx
+
+	movl	PARAM_DST, %edx
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	movl	(%eax), %eax
+	movl	%edx, %ecx
+
+	mull	%eax
+
+	movl	%edx, 4(%ecx)
+	movl	%eax, (%ecx)
+	ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers.  Perhaps the loads for the first set hide under the
+C mul latency and the second gets store to load forwarding.
+
+	ALIGN(16)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+deflit(`FRAME',0)
+
+	pushl	%ebx		FRAME_pushl()
+	movl	%eax, %ebx	C src
+	movl	(%eax), %eax
+
+	movl	%edx, %ecx	C dst
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)	C dst[1]
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, 8(%ecx)	C dst[2]
+	movl	(%ebx), %eax
+
+	movl	%edx, 12(%ecx)	C dst[3]
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	popl	%ebx
+
+	addl	%eax, 4(%ecx)
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+	ASSERT(nc)
+
+	addl	%eax, 4(%ecx)
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+	ASSERT(nc)
+
+	ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX,  -4)
+defframe(SAVE_ESI,  -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+	subl	$STACK_SPACE, %esp
+	cmpl	$4, %ecx
+	jae	L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	movl	%ebx, SAVE_EBX
+	movl	%eax, %ebx	C src
+	movl	(%eax), %eax
+
+	movl	%edx, %ecx	C dst
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+	movl	%edx, 4(%ecx)
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	8(%ebx), %eax
+	movl	%edx, 12(%ecx)
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	(%ebx), %eax
+	movl	%edx, 20(%ecx)
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	(%ebx), %eax
+	movl	%edx, %edi
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%ebp, SAVE_EBP
+	movl	$0, %ebp
+
+	movl	4(%ebx), %eax
+	adcl	%edx, %ebp
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx
+	addl	%eax, %ebp
+
+	adcl	$0, %edx
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi
+
+	adcl	%edi, %edi
+	movl	4(%ecx), %eax
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+
+	adcl	$0, %ebx
+	addl	%eax, %esi
+	movl	8(%ecx), %eax
+
+	adcl	%eax, %edi
+	movl	12(%ecx), %eax
+	movl	%esi, 4(%ecx)
+
+	adcl	%eax, %ebp
+	movl	16(%ecx), %eax
+	movl	%edi, 8(%ecx)
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+
+	adcl	%eax, %edx
+	movl	20(%ecx), %eax
+	movl	%ebp, 12(%ecx)
+
+	adcl	%ebx, %eax
+	ASSERT(nc)
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	movl	%edx, 16(%ecx)
+	movl	%eax, 20(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP',    -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+	movl	%ebx, SAVE_EBX
+	movl	%edi, SAVE_EDI
+	leal	(%edx,%ecx,4), %edi	C &dst[size]
+
+	movl	%esi, SAVE_ESI
+	movl	%ebp, SAVE_EBP
+	leal	(%eax,%ecx,4), %esi	C &src[size]
+
+	movl	(%eax), %ebp		C multiplier
+	movl	$0, %ebx
+	decl	%ecx
+
+	negl	%ecx
+	subl	$EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time.  There seems no way to avoid this.
+
+dnl  This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl  With the unmodified &src[size] and &dst[size] pointers, the
+dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl  values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl  Because the last chunk of code is generated differently, a label placed
+dnl  at the end doesn't work.  Instead calculate the implied end using the
+dnl  start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+	C eax
+	C ebx	carry
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, (%edi)
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
+ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
+
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+	C The calculated jump mustn't come out to before the start of the
+	C code available.  This is the limit UNROLL_COUNT puts on the src
+	C operand size, but checked here directly using the jump address.
+	ASSERT(ae,
+	`movl_text_address(L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	high limb to store
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	&src[size], constant
+	C edi	dst ptr, high of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
+	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand
+
+	movl	%edx, VAR_COUNTER
+
+	mull	%ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+	testb	$1, %cl
+	movl	%edx, %ebx	C high carry
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	cmovX(	%ebx, %ecx)	C high carry reverse
+	cmovX(	%eax, %ebx)	C low carry reverse
+
+	leal	CODE_BYTES_PER_LIMB(%edx), %eax
+	xorl	%edx, %edx
+	leal	4(%edi), %edi
+
+	movl	%eax, VAR_JMP
+
+	jmp	*%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	addl	(%esp), %ecx
+	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+	C Must be an even address to preserve the significance of the low
+	C bit of the jump address indicating which way around ecx/ebx should
+	C start.
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	next limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src - 4))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	adcl	%edx, %ebx
+
+	mull	%ebp
+
+Zdisp(  addl,	%ecx, disp_dst,(%edi))
+	movl	$0, %ecx
+
+	adcl	%eax, %ebx
+
+',`
+	dnl  this bit comes out last
+Zdisp(  movl,	disp_src,(%esi), %eax)
+	adcl	%edx, %ecx
+
+	mull	%ebp
+
+Zdisp(	addl,	%ebx, disp_dst,(%edi))
+
+ifelse(forloop_last,0,
+`	movl	$0, %ebx')
+
+	adcl	%eax, %ecx
+')
+')
+
+	C eax	next limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+	adcl	$0, %edx
+	addl	%ecx, -4+OFFSET(%edi)
+	movl	VAR_JMP, %ecx
+
+	adcl	$0, %edx
+
+	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
+	movl	VAR_COUNTER, %edx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %esi
+	addl	$OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-5]
+
+	movl	-12(%esi), %ebp
+	movl	-8(%esi), %eax
+	movl	%eax, %ecx
+
+	mull	%ebp
+
+	addl	%eax, -4(%edi)
+	movl	-4(%esi), %eax
+
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	movl	%eax, %esi
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+
+	adcl	$0, %edx
+	addl	%eax, (%edi)
+	movl	%esi, %eax
+
+	adcl	$0, %edx
+	movl	%edx, %ebx
+
+	mull	%ecx
+
+	addl	%ebx, %eax
+	movl	%eax, 4(%edi)
+
+	adcl	$0, %edx
+	movl	%edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edi
+	xorl	%ecx, %ecx		C clear carry
+
+	leal	(%edi,%eax,8), %edi
+	notl	%eax			C -size-1, preserve carry
+
+	leal	2(%eax), %eax		C -(size-1)
+
+L(lshift):
+	C eax	counter, negative
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	dst, pointing just after last limb
+	C ebp
+
+	rcll	-4(%edi,%eax,8)
+	rcll	(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+	setc	%al
+
+	movl	PARAM_SRC, %esi
+	movl	%eax, -4(%edi)		C dst most significant limb
+
+	movl	PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+
+	mull	%eax
+
+	leal	(%esi,%ecx,4), %esi	C src points just after last limb
+	negl	%ecx
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+	incl	%ecx
+
+L(diag):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, negative
+	C edx	carry
+	C esi	src just after last limb
+	C edi	dst just after last limb
+	C ebp
+
+	movl	(%esi,%ecx,4), %eax
+	movl	%edx, %ebx
+
+	mull	%eax
+
+	addl	%ebx, -4(%edi,%ecx,8)
+	adcl	%eax, (%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	addl	%edx, -4(%edi)		C dst most significant limb
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
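The structure the comments above describe can be summarized in a short model: accumulate the cross products src[i]*src[j] for i < j, double the whole partial result with the one-bit left shift, then add the squares on the diagonal. The following Python sketch is illustrative only; `sqr_basecase_model` is a made-up name, and 32-bit limbs are assumed to match this x86 code, not the GMP API.

```python
MASK = (1 << 32) - 1  # 32-bit limbs, as in this x86 code

def sqr_basecase_model(src):
    """Model of the squaring structure above: cross products first,
    a one-bit left shift to double them, then the diagonal squares."""
    n = len(src)
    dst = [0] * (2 * n)
    # cross products src[i]*src[j], i < j
    for i in range(n - 1):
        carry = 0
        for j in range(i + 1, n):
            t = src[i] * src[j] + dst[i + j] + carry
            dst[i + j] = t & MASK
            carry = t >> 32
        dst[i + n] = carry
    # left shift of dst: doubles every cross product
    carry = 0
    for k in range(2 * n):
        t = (dst[k] << 1) | carry
        dst[k] = t & MASK
        carry = t >> 32
    # add src[i]^2 on the diagonal, propagating the carry
    carry = 0
    for i in range(n):
        s = src[i] * src[i]
        t = dst[2 * i] + (s & MASK) + carry
        dst[2 * i] = t & MASK
        t = dst[2 * i + 1] + (s >> 32) + (t >> 32)
        dst[2 * i + 1] = t & MASK
        carry = t >> 32
    return dst
```

The identity being exploited is src^2 = 2*(sum of cross products) + (sum of squares), which lets the inner loop do one mul per limb pair instead of two.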
diff --git a/third_party/gmp/mpn/x86/k7/sublsh1_n.asm b/third_party/gmp/mpn/x86/k7/sublsh1_n.asm
new file mode 100644
index 0000000..8851683
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/sublsh1_n.asm
@@ -0,0 +1,173 @@
+dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
+C inner loop is 2*4-way unrolled, which is the best we can do with the
+C available registers.  It seems tricky to use the same structure for
+C rsblsh1_n, since we cannot feed carry between operations there.
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6.75
+C AMD K6
+C AMD K7
+C AMD K8
+
+C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
+C processors.  It uses 2*4-way unrolling, for good reasons.
+C
+C Breaking the carry recurrence might be a good idea.  We would then need
+C separate registers for the shift carry and add/subtract carry, which in
+C turn would force us to 2*2-way unrolling.
+
+defframe(PARAM_SIZE,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sublsh1_n_ip1)
+deflit(`FRAME',0)
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+
+	mov	PARAM_SIZE, %eax	C size
+	push	up			FRAME_pushl()
+	push	rp			FRAME_pushl()
+	xor	%edx, %edx
+	mov	PARAM_SRC, up
+	mov	PARAM_DST, rp
+	mov	%ebx, SAVE_EBX
+	mov	%eax, %ebx
+	shr	$3, %eax
+
+	not	%eax			C count = -(size\8)-1
+	and	$7, %ebx		C size % 8
+	jz	L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(up), %ecx
+	adc	%ecx, %ecx
+	rcr	%edx			C restore 1st saved carry bit
+	lea	4(up), up
+	sbb	%ecx, (rp)
+	lea	4(rp), rp
+	adc	%edx, %edx		C save a carry bit in edx
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	dec	%ebx
+	jnz	L(oop)
+L(exact):
+	inc	%eax
+	jz	L(end)
+	mov	%eax, VAR_COUNT
+	mov	%ebp, SAVE_EBP
+
+	ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(up), %eax
+	adc	%eax, %eax
+	mov	4(up), %ebx
+	adc	%ebx, %ebx
+	mov	8(up), %ecx
+	adc	%ecx, %ecx
+	mov	12(up), %ebp
+	adc	%ebp, %ebp
+
+	rcr	%edx			C restore 1st saved carry bit
+
+	sbb	%eax, (rp)
+	sbb	%ebx, 4(rp)
+	sbb	%ecx, 8(rp)
+	sbb	%ebp, 12(rp)
+
+	mov	16(up), %eax
+	adc	%eax, %eax
+	mov	20(up), %ebx
+	adc	%ebx, %ebx
+	mov	24(up), %ecx
+	adc	%ecx, %ecx
+	mov	28(up), %ebp
+	adc	%ebp, %ebp
+
+	lea	32(up), up
+	adc	%edx, %edx		C save a carry bit in edx
+
+	sbb	%eax, 16(rp)
+	sbb	%ebx, 20(rp)
+	sbb	%ecx, 24(rp)
+	sbb	%ebp, 28(rp)
+
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	incl	VAR_COUNT
+	lea	32(rp), rp
+	jne	L(top)
+
+	mov	SAVE_EBP, %ebp
+L(end):
+	mov	SAVE_EBX, %ebx
+
+ifdef(`CPU_P6',`
+	xor	%eax, %eax
+	shr	$1, %edx
+	adc	%edx, %eax
+',`
+	adc	$0, %edx
+	mov	%edx, %eax
+')
+	pop	rp			FRAME_popl()
+	pop	up			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
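The routine above runs two carry chains at once: the bit shifted out by the doubling and the borrow of the subtraction (the asm keeps both packed into %edx, restoring them with rcr/shr). A hedged Python model of the rp[] = rp[] - (up[] << 1) contract, assuming 32-bit limbs and an illustrative function name:

```python
MASK = (1 << 32) - 1  # 32-bit limbs

def sublsh1_n_ip1_model(rp, up):
    """Model of mpn_sublsh1_n_ip1: rp[] -= (up[] << 1), in place.
    Returns shift-out bit plus subtraction borrow, matching the asm's
    final adc/mov into %eax."""
    shift_c = 0   # carry bit of the left shift
    borrow = 0    # borrow of the subtraction
    for i in range(len(rp)):
        v = (up[i] << 1) | shift_c      # double up[i], feed in shift carry
        shift_c = v >> 32
        v &= MASK
        t = rp[i] - v - borrow
        borrow = 1 if t < 0 else 0
        rp[i] = t & MASK
    return shift_c + borrow
```

As a value identity, R - 2*U equals the new R minus retval * 2**(32*n), which is why the two carries can simply be summed for the return value.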
diff --git a/third_party/gmp/mpn/x86/k8/gmp-mparam.h b/third_party/gmp/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 0000000..fa71292
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 36.85% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           44
+
+#define DIV_1_VS_MUL_1_PERCENT             251
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                78
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      96
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     121
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                 81
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                300
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               22
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    895,11}, {   1791,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4351,13}, {   2431,14}, \
+    {   1279,13}, {   2943,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4351,14}, {   2303,13}, {   4991,14}, {   2815,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD                 7296
+
+#define SQR_FFT_MODF_THRESHOLD             500  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    500, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607, 9}, {   1215,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2431,14}, {   1279,13}, {   2943,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4223,14}, {   2303,13}, {   4863,14}, \
+    {   2815,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_DC_THRESHOLD                  29
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 193
+#define SQRLO_SQR_THRESHOLD              10704
+
+#define DC_DIV_QR_THRESHOLD                 84
+#define DC_DIVAPPR_Q_THRESHOLD             278
+#define DC_BDIV_QR_THRESHOLD                87
+#define DC_BDIV_Q_THRESHOLD                216
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               268
+#define INV_APPR_THRESHOLD                 268
+
+#define BINV_NEWTON_THRESHOLD              276
+#define REDC_1_TO_REDC_N_THRESHOLD          78
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1466
+
+#define POWM_SEC_TABLE  1,22,102,452,1357
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1149
+
+#define FAC_DSC_THRESHOLD                  208
+#define FAC_ODD_THRESHOLD                   48
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    3  /* 4.69% faster than 1 */
+#define HGCD_THRESHOLD                     139
+#define HGCD_APPR_THRESHOLD                174
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   599
+#define GCDEXT_DC_THRESHOLD                419
+#define JACOBI_BASE_METHOD                   1  /* 1.57% faster than 4 */
+
+/* Tuneup completed successfully, took 83851 seconds */
diff --git a/third_party/gmp/mpn/x86/lshift.asm b/third_party/gmp/mpn/x86/lshift.asm
new file mode 100644
index 0000000..6ee6153
--- /dev/null
+++ b/third_party/gmp/mpn/x86/lshift.asm
@@ -0,0 +1,106 @@
+dnl  x86 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54	 7.5
+C P55	 7.0
+C P6	 2.5
+C K6	 4.5
+C K7	 5.0
+C P4	14.5
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+deflit(`FRAME',12)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%edx
+	movl	PARAM_SHIFT,%ecx
+
+	subl	$4,%esi			C adjust src
+
+	movl	(%esi,%edx,4),%ebx	C read most significant limb
+	xorl	%eax,%eax
+	shldl(	%cl, %ebx, %eax)	C compute carry limb
+	decl	%edx
+	jz	L(end)
+	pushl	%eax			C push carry limb onto stack
+	testb	$1,%dl
+	jnz	L(1)			C enter loop in the middle
+	movl	%ebx,%eax
+
+	ALIGN(8)
+L(oop):	movl	(%esi,%edx,4),%ebx	C load next lower limb
+	shldl(	%cl, %ebx, %eax)	C compute result limb
+	movl	%eax,(%edi,%edx,4)	C store it
+	decl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shldl(	%cl, %eax, %ebx)
+	movl	%ebx,(%edi,%edx,4)
+	decl	%edx
+	jnz	L(oop)
+
+	shll	%cl,%eax		C compute least significant limb
+	movl	%eax,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+L(end):	shll	%cl,%ebx		C compute least significant limb
+	movl	%ebx,(%edi)		C store it
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
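The shldl-based loop above computes each result limb from a pair of adjacent source limbs, working from the most significant end down; the bits shifted out of the top limb become the return value. A Python sketch of that contract, assuming 0 < shift < limb_bits (the model's name is illustrative, not GMP API):

```python
def mpn_lshift_model(src, shift, limb_bits=32):
    """Model of mpn_lshift: dst gets src shifted left by `shift` bits
    (0 < shift < limb_bits); the return value is the bits shifted out
    of the most significant limb, as computed by the first shldl."""
    mask = (1 << limb_bits) - 1
    n = len(src)
    dst = [0] * n
    retval = src[n - 1] >> (limb_bits - shift)   # carry limb
    # each limb combines with the next lower one, as shldl does
    for i in range(n - 1, 0, -1):
        dst[i] = ((src[i] << shift) | (src[i - 1] >> (limb_bits - shift))) & mask
    dst[0] = (src[0] << shift) & mask            # least significant limb
    return dst, retval
```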
diff --git a/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm b/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm
new file mode 100644
index 0000000..aae158a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm
@@ -0,0 +1,163 @@
+dnl  X86 MMX mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			     cycles/limb     cycles/limb
+C			      ali,evn n	     unal,evn n
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 1.33		 1.87
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	 2.1		 2.63
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)		 1.7		 2.57
+C Intel Atom			 1.85		 2.7
+C AMD K6
+C AMD K7			 1.33		 1.33
+C AMD K8
+C AMD K10
+
+define(`rp',     `%edi')
+define(`tp',     `%esi')
+define(`n',      `%edx')
+define(`nents',  `%ecx')
+define(`which',  `')
+
+define(`i',      `%ebp')
+define(`j',      `%ebx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	push	%ebx
+	push	%esi
+	push	%edi
+	push	%ebp
+
+	mov	20(%esp), rp
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents
+
+	movd	36(%esp), %mm6
+	punpckldq %mm6, %mm6		C 2 copies of `which'
+
+	mov	$1, %ebx
+	movd	%ebx, %mm7
+	punpckldq %mm7, %mm7		C 2 copies of 1
+
+	mov	n, j
+	add	$-4, j
+	js	L(outer_end)
+
+L(outer_top):
+	mov	nents, i
+	mov	tp, %eax
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	pxor	%mm5, %mm5
+	ALIGN(16)
+L(top):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movq	(tp), %mm2
+	movq	8(tp), %mm3
+	pand	%mm0, %mm2
+	pand	%mm0, %mm3
+	por	%mm2, %mm4
+	por	%mm3, %mm5
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(top)
+
+	movq	%mm4, (rp)
+	movq	%mm5, 8(rp)
+
+	lea	16(%eax), tp
+	lea	16(rp), rp
+	add	$-4, j
+	jns	L(outer_top)
+L(outer_end):
+
+	test	$2, %dl
+	jz	L(b0x)
+
+L(b1x):	mov	nents, i
+	mov	tp, %eax
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	ALIGN(16)
+L(tp2):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movq	(tp), %mm2
+	pand	%mm0, %mm2
+	por	%mm2, %mm4
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(tp2)
+
+	movq	%mm4, (rp)
+
+	lea	8(%eax), tp
+	lea	8(rp), rp
+
+L(b0x):	test	$1, %dl
+	jz	L(b00)
+
+L(b01):	mov	nents, i
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	ALIGN(16)
+L(tp1):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movd	(tp), %mm2
+	pand	%mm0, %mm2
+	por	%mm2, %mm4
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(tp1)
+
+	movd	%mm4, (rp)
+
+L(b00):	pop	%ebp
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	emms
+	ret
+EPILOGUE()
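The point of the pcmpeqd/pand/por sequence above is to copy one table entry without a data-dependent branch or load address: every entry is read, ANDed with an all-ones or all-zeros mask, and ORed into the result. A hedged Python model of that selection (32-bit limbs assumed; the function name is illustrative; the `-(k == which) & MASK` trick stands in for pcmpeqd):

```python
MASK = (1 << 32) - 1  # 32-bit limbs

def sec_tabselect_model(tab, n, nents, which):
    """Model of mpn_sec_tabselect: pick entry `which` (n limbs) out of a
    flat table of nents entries, touching every entry so that memory
    access does not depend on the secret index."""
    rp = [0] * n
    for k in range(nents):
        mask = -(k == which) & MASK   # all ones iff k == which
        for i in range(n):
            rp[i] |= tab[k * n + i] & mask
    return rp
```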
diff --git a/third_party/gmp/mpn/x86/mod_34lsub1.asm b/third_party/gmp/mpn/x86/mod_34lsub1.asm
new file mode 100644
index 0000000..e09e702
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mod_34lsub1.asm
@@ -0,0 +1,183 @@
+dnl  Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C P5	  3.0
+C P6	  3.66
+C K6	  3.0
+C K7	  1.3
+C P4	  9
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jb	L(one)
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+	shrl	$24, %eax		C src[0] low
+
+	andl	$0xFFFFFF, %edx		C src[0] high
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	andl	$0xFFFF, %ecx
+	shrl	$16, %edx		C src[1] high
+	addl	%edx, %eax
+
+	shll	$8, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	movl	%ebx, SAVE_EBX		C and arrange 16-byte loop alignment
+	xorl	%ebx, %ebx
+
+	pushl	%esi	FRAME_pushl()
+	xorl	%esi, %esi
+
+	pushl	%edi	FRAME_pushl()
+	xorl	%eax, %eax		C and clear carry flag
+
+
+	C offset 0x40 here
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi
+	C ebp
+
+	leal	12(%edx), %edx
+	leal	-2(%ecx), %ecx
+
+	adcl	-12(%edx), %eax
+	adcl	-8(%edx), %ebx
+	adcl	-4(%edx), %esi
+
+	decl	%ecx
+	jg	L(top)
+
+
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	movl	$0xFFFFFFFF, %edi
+	incl	%ecx
+	js	L(combine)
+
+	adcl	(%edx), %eax
+	movl	$0xFFFFFF00, %edi
+	decl	%ecx
+	js	L(combine)
+
+	adcl	4(%edx), %ebx
+	movl	$0xFFFF0000, %edi
+
+
+L(combine):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	mask
+	C ebp
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%eax, %edx		C 0mod3
+
+	shrl	$24, %eax		C 0mod3 high
+	andl	%edi, %ecx		C carry masked
+
+	subl	%ecx, %eax		C apply carry
+	movl	%ebx, %edi		C 1mod3
+
+	shrl	$16, %ebx		C 1mod3 high
+	andl	$0x00FFFFFF, %edx	C 0mod3 low
+
+	addl	%edx, %eax		C apply 0mod3 low
+	andl	$0xFFFF, %edi
+
+	shll	$8, %edi		C 1mod3 low
+	addl	%ebx, %eax		C apply 1mod3 high
+
+	addl	%edi, %eax		C apply 1mod3 low
+	movl	%esi, %edx		C 2mod3
+
+	shrl	$8, %esi		C 2mod3 high
+	andl	$0xFF, %edx		C 2mod3 low
+
+	shll	$16, %edx		C 2mod3 low
+	addl	%esi, %eax		C apply 2mod3 high
+
+	addl	%edx, %eax		C apply 2mod3 low
+	popl	%edi	FRAME_popl()
+
+	movl	SAVE_EBX, %ebx
+	popl	%esi	FRAME_popl()
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/mul_1.asm b/third_party/gmp/mpn/x86/mul_1.asm
new file mode 100644
index 0000000..421de62
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mul_1.asm
@@ -0,0 +1,140 @@
+dnl  x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl  with a limb and store the result in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1997-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				12.5
+C P6 model 0-8,10-12		 5.5
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.25
+C P4 model 0  (Willamette)	19.0
+C P4 model 1  (?)		19.0
+C P4 model 2  (Northwood)	19.0
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			10.5
+C AMD K7			 4.5
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%ebx	C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_SIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(end)
+
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebx,(%edi)
+	addl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebp,4(%edi)
+	addl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebx,8(%edi)
+	addl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	%ebp,12(%edi)
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oop)
+
+L(end):	movl	%ebx,%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
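The cylimb handling the comments above describe (each product's high half carries into the next limb's low half) reduces to a single running carry. A Python sketch of the mpn_mul_1 contract, assuming 32-bit limbs and an illustrative name:

```python
MASK = (1 << 32) - 1  # 32-bit limbs

def mpn_mul_1_model(src, multiplier):
    """Model of mpn_mul_1: dst[] = src[] * multiplier; returns the high
    'carry' limb that does not fit, the cylimb kept in %ebx above."""
    dst = []
    carry = 0
    for limb in src:
        t = limb * multiplier + carry   # 64-bit product plus carry-in
        dst.append(t & MASK)
        carry = t >> 32
    return dst, carry
```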
diff --git a/third_party/gmp/mpn/x86/mul_basecase.asm b/third_party/gmp/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000..8339732
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,223 @@
+dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl  in a third limb vector.
+
+dnl  Copyright 1996-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct
+C P5	  15
+C P6	   7.5
+C K6	  12.5
+C K7	   5.5
+C P4	  24
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste, since the Pentium-optimized code that had been
+C used for all x86 machines was slow on the Pentium II.  This code would
+C benefit from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the L(outer) loop, one for each outcome of xsize mod 4.  That would
+C avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER,    -8)
+deflit(VAR_STACK_SPACE,  8)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	subl	$VAR_STACK_SPACE,%esp
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+
+	leal	4(%esi),%esi
+	leal	4(%edi),%edi
+
+L(oopM):
+	movl	(%esi),%eax		C load next limb at xp[j]
+	leal	4(%esi),%esi
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%edx,%ebx
+	adcl	$0,%ebx
+	movl	%eax,(%edi)
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oopM)
+
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	movl	PARAM_XSIZE,%eax
+	shll	$2,%eax
+	subl	%eax,%edi
+	subl	%eax,%esi
+
+	movl	PARAM_YSIZE,%eax	C ysize
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	movl	PARAM_YP,%ebp		C yp
+	addl	$4,%ebp			C make ebp point to next v limb
+	movl	%ebp,PARAM_YP
+	movl	(%ebp),%eax		C copy y limb ...
+	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
+	movl	PARAM_XSIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	VAR_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	addl	%eax,(%edi)
+	adcl	%edx,%ebx		C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_XSIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(endX)
+
+	ALIGN(8)
+L(oopX):
+	movl	(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebx,(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebp,4(%edi)
+	adcl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebx,8(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	addl	%ebp,12(%edi)
+	adcl	$0,%ebx		C propagate carry into cylimb
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oopX)
+
+L(endX):
+	movl	%ebx,(%edi)
+	addl	$4,%edi
+
+	C we incremented wp and xp in the loop above; compensate
+	movl	PARAM_XSIZE,%eax
+	shll	$2,%eax
+	subl	%eax,%edi
+	subl	%eax,%esi
+
+	movl	VAR_COUNTER,%eax
+	decl	%eax
+	movl	%eax,VAR_COUNTER
+	jnz	L(outer)
+
+L(skip):
+	popl	%ebx
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$8,%esp
+	ret
+
+L(done):
+	movl	%edx,4(%edi)	   C store to wp[1]
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$8,%esp
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/nano/gmp-mparam.h b/third_party/gmp/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 0000000..cd8ac4e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,162 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD               132
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     135
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                194
+#define SQR_TOOM4_THRESHOLD                502
+#define SQR_TOOM6_THRESHOLD                746
+#define SQR_TOOM8_THRESHOLD               1005
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    308, 5}, {     13, 6}, {      7, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     11, 5}, {     23, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     24, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 8}, {     11, 7}, {     25, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     47,10}, \
+    {     31, 9}, {     71,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    543, 9}, \
+    {    287, 8}, {    575, 7}, {   1215,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    543, 8}, {   1087,10}, {    287, 9}, \
+    {    607, 8}, {   1215,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703, 8}, {   1407, 9}, \
+    {    735, 8}, {   1471,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,10}, {    479, 9}, {    959, 8}, \
+    {   1919,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 1856
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     13, 6}, {      7, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     15, 6}, {     31, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 8}, {     11, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    543,10}, {    143, 9}, \
+    {    287, 8}, {    607, 7}, {   1215, 6}, {   2431,10}, \
+    {    159, 8}, {    639,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    607, 8}, {   1215,11}, \
+    {    159,10}, {    319, 9}, {    671,10}, {    351, 9}, \
+    {    703, 8}, {   1407, 9}, {    735, 8}, {   1471, 7}, \
+    {   2943,11}, {    191,10}, {    383, 9}, {    799,10}, \
+    {    415, 9}, {    895,10}, {    479,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD                 2368
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  51
+#define MULLO_MUL_N_THRESHOLD             3369
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             183
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                118
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 218
+
+#define BINV_NEWTON_THRESHOLD              268
+#define REDC_1_TO_REDC_N_THRESHOLD          56
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              124
+#define MU_BDIV_QR_THRESHOLD               855
+#define MU_BDIV_Q_THRESHOLD               1334
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                     104
+#define HGCD_APPR_THRESHOLD                139
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               542
+#define SET_STR_PRECOMPUTE_THRESHOLD       840
diff --git a/third_party/gmp/mpn/x86/p6/README b/third_party/gmp/mpn/x86/p6/README
new file mode 100644
index 0000000..f19d47b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/README
@@ -0,0 +1,125 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III.  The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these could probably be improved.
+
+                               cycles/limb
+
+	mpn_add_n/sub_n           3.7
+
+	mpn_copyi                 0.75
+	mpn_copyd                 1.75 (or 0.75 if no overlap)
+
+	mpn_divrem_1             39.0
+	mpn_mod_1                21.5
+	mpn_divexact_by3          8.5
+
+	mpn_mul_1                 5.5
+	mpn_addmul/submul_1       6.35
+
+	mpn_l/rshift              2.5
+
+	mpn_mul_basecase          8.2 cycles/crossproduct (approx)
+	mpn_sqr_basecase          4.0 cycles/crossproduct (approx)
+				  or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+	mpn_divrem_1             25.0 integer part, 17.5 fractional part
+
+	mpn_l/rshift              1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending on how far speculative execution has gone.  The 9 cycle
+minimum penalty comes from the issue pipeline being 9 stages long.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same.  The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+	http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816.  This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+	http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/p6/aors_n.asm b/third_party/gmp/mpn/x86/p6/aors_n.asm
new file mode 100644
index 0000000..df51c2e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/aors_n.asm
@@ -0,0 +1,156 @@
+dnl  Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Avoid indexed addressing, it makes us stall on the two-ported register
+C    file.
+
+C			    cycles/limb
+C P6 model 0-8,10-12		3.17
+C P6 model 9   (Banias)		2.15
+C P6 model 13  (Dothan)		2.25
+
+
+define(`rp',	`%edi')
+define(`up',	`%esi')
+define(`vp',	`%ebx')
+define(`n',	`%ecx')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(func)
+	xor	%edx, %edx
+L(start):
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), vp
+	mov	28(%esp), n
+
+	lea	(up,n,4), up
+	lea	(vp,n,4), vp
+	lea	(rp,n,4), rp
+
+	neg	n
+	mov	n, %eax
+	and	$-8, n
+	and	$7, %eax
+	shl	$2, %eax			C 4x
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	lea	L(ent) (%eax,%eax,2), %eax	C 12x
+')
+
+	shr	%edx				C set cy flag
+	jmp	*%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	lea	(%eax,%eax,2), %eax
+	add	$L(ent)-L(here), %eax
+	add	(%esp), %eax
+	ret_internal
+')
+
+L(end):
+	sbb	%eax, %eax
+	neg	%eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+	ALIGN(16)
+L(top):
+	jecxz	L(end)
+L(ent):
+Zdisp(	mov,	0,(up,n,4), %eax)
+Zdisp(	ADCSBB,	0,(vp,n,4), %eax)
+Zdisp(	mov,	%eax, 0,(rp,n,4))
+
+	mov	4(up,n,4), %edx
+	ADCSBB	4(vp,n,4), %edx
+	mov	%edx, 4(rp,n,4)
+
+	mov	8(up,n,4), %eax
+	ADCSBB	8(vp,n,4), %eax
+	mov	%eax, 8(rp,n,4)
+
+	mov	12(up,n,4), %edx
+	ADCSBB	12(vp,n,4), %edx
+	mov	%edx, 12(rp,n,4)
+
+	mov	16(up,n,4), %eax
+	ADCSBB	16(vp,n,4), %eax
+	mov	%eax, 16(rp,n,4)
+
+	mov	20(up,n,4), %edx
+	ADCSBB	20(vp,n,4), %edx
+	mov	%edx, 20(rp,n,4)
+
+	mov	24(up,n,4), %eax
+	ADCSBB	24(vp,n,4), %eax
+	mov	%eax, 24(rp,n,4)
+
+	mov	28(up,n,4), %edx
+	ADCSBB	28(vp,n,4), %edx
+	mov	%edx, 28(rp,n,4)
+
+	lea	8(n), n
+	jmp	L(top)
+
+EPILOGUE()
+
+PROLOGUE(func_nc)
+	movl	20(%esp), %edx
+	jmp	L(start)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/aorsmul_1.asm b/third_party/gmp/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000..bc8c49c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,320 @@
+dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 6.44
+C P6 model 9  (Banias)		 6.15
+C P6 model 13 (Dothan)		 6.11
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7
+C AMD K8
+
+
+dnl  P6 UNROLL_COUNT cycles/limb
+dnl          8           6.7
+dnl         16           6.35
+dnl         32           6.3
+dnl         64           6.3
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+	define(M4_inst,        addl)
+	define(M4_function_1,  mpn_addmul_1)
+	define(M4_function_1c, mpn_addmul_1c)
+	define(M4_description, add it to)
+	define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+	define(M4_inst,        subl)
+	define(M4_function_1,  mpn_submul_1)
+	define(M4_function_1c, mpn_submul_1c)
+	define(M4_description, subtract it from)
+	define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                            mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                             mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code.  The unrolled loop is
+C the same, but there are just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+	pushl	%ebx
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %ebx
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+	push	%ebx
+deflit(`FRAME',4)
+	xorl	%ebx, %ebx	C initial carry
+
+L(start_nc):
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi
+deflit(`FRAME',8)
+
+	movl	PARAM_SRC, %esi
+	pushl	%edi
+deflit(`FRAME',12)
+
+	movl	PARAM_DST, %edi
+	pushl	%ebp
+deflit(`FRAME',16)
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_MULTIPLIER, %ebp
+	jae	L(unroll)
+
+
+	C simple loop
+	C this is offset 0x22, so close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+	movl	(%esi), %eax
+	addl	$4, %edi
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	adcl	$0, %edx
+
+	M4_inst	%eax, -4(%edi)
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	decl	%ecx
+
+	leal	4(%esi), %esi
+	jnz	L(simple)
+
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	movl	%ebx, %eax
+
+	popl	%ebx
+	ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points.  Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl  overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP,   `PARAM_DST')
+
+	C this is offset 0x43, so close enough to aligned
+L(unroll):
+	C eax
+	C ebx	initial carry
+	C ecx	size
+	C edx
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	%ecx, %edx
+	decl	%ecx
+
+	subl	$2, %edx
+	negl	%ecx
+
+	shrl	$UNROLL_LOG2, %edx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%edx, VAR_COUNTER
+	movl	%ecx, %edx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	shll	$4, %edx
+	negl	%ecx
+
+	leal	L(entry) (%edx,%ecx,1), %edx
+')
+	movl	(%esi), %eax		C src low limb
+
+	movl	%edx, VAR_JUMP
+	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+	mull	%ebp
+
+	addl	%ebx, %eax	C initial carry (from _1c)
+	adcl	$0, %edx
+
+	movl	%edx, %ebx	C high carry
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+	movl	VAR_JUMP, %edx
+	testl	$1, %ecx
+	movl	%eax, %ecx	C low carry
+
+	cmovnz(	%ebx, %ecx)	C high,low carry other way around
+	cmovnz(	%eax, %ebx)
+
+	jmp	*%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	shll	$4, %edx
+	negl	%ecx
+
+	C See mpn/x86/README about old gas bugs
+	leal	(%edx,%ecx,1), %edx
+	addl	$L(entry)-L(here), %edx
+
+	addl	(%esp), %edx
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------
+	ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+	C eax	scratch
+	C ebx	carry hi
+	C ecx	carry lo
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+	C
+	C VAR_COUNTER	loop counter
+	C
+	C 15 code bytes per limb
+
+	addl	$UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	mull	%ebp
+Zdisp(	M4_inst,%ecx, disp0,(%edi))
+	adcl	%eax, %ebx
+	movl	%edx, %ecx
+	adcl	$0, %ecx
+
+	movl	disp1(%esi), %eax
+	mull	%ebp
+	M4_inst	%ebx, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %ebx
+	adcl	$0, %ebx
+')
+
+	decl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+
+	jns	L(top)
+
+
+deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+	M4_inst	%ecx, disp0(%edi)
+	movl	%ebx, %eax
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+	adcl	$0, %eax
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm b/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm
new file mode 100644
index 0000000..a0a9d90
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm
@@ -0,0 +1,287 @@
+dnl  Intel P6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- exact division by a 1-limb divisor.
+
+dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       odd  even  divisor
+C P6:  10.0  12.0  cycles/limb
+
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles, which is the length of the
+C dependent chain.
+C
+C The shifts for the even case aren't on the dependent chain, so in principle
+C it could run at the same speed too, but nothing running at 10 has been
+C found.  Perhaps there are too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+deflit(STACK_SPACE, 16)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_INVERSE, %ebp
+
+	movl	PARAM_SHIFT, %ecx	C trailing twos
+
+L(common):
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebx,4), %esi	C src end
+
+	leal	(%edi,%ebx,4), %edi	C dst end
+	negl	%ebx			C -size
+
+	movl	(%esi,%ebx,4), %eax	C src[0]
+
+	orl	%ecx, %ecx
+	jz	L(odd_entry)
+
+	movl	%edi, PARAM_DST
+	movl	%ebp, VAR_INVERSE
+
+L(even):
+	C eax	src[0]
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx
+	C esi
+	C edi
+	C ebp
+
+	xorl	%ebp, %ebp		C initial carry bit
+	xorl	%edx, %edx		C initial carry limb (for size==1)
+
+	incl	%ebx
+	jz	L(even_one)
+
+	movl	(%esi,%ebx,4), %edi	C src[1]
+
+	shrdl(	%cl, %edi, %eax)
+
+	jmp	L(even_entry)
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size] and scratch
+	C ebp	carry bit
+
+	movl	(%esi,%ebx,4), %edi
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi,%ebx,4), %eax
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebp, %eax
+
+	sbbl	%ebp, %ebp
+	subl	%edx, %eax
+
+	sbbl	$0, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	PARAM_DST, %edi
+	negl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+	jnz	L(even_top)
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax
+
+L(even_one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebp, %eax
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%ebp, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(odd_top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	inverse
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+L(odd_entry):
+	imull	%ebp, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+	negl	%ecx
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	bsfl	%eax, %ecx		C trailing twos
+
+	movl	%ebp, SAVE_EBP
+
+	shrl	%cl, %eax		C d without twos
+
+	movl	%eax, %edx
+	shrl	%eax			C d/2 without twos
+
+	movl	%edx, PARAM_DIVISOR
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
+')
+
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+	imull	%edx, %ebp	C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+	imull	%edx, %eax	C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+
+	jmp	L(common)
+
+EPILOGUE()
+ASM_END()
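[Editor's note] The prologue above builds a 32-bit inverse of the (odd part of the) divisor from an 8-bit table lookup followed by two Newton steps of the form `inv = 2*inv - inv*inv*d`, each of which doubles the number of correct low bits. The following C sketch shows the same iteration; it is not GMP's `binvert_limb`, and it starts from the classic `(3*d) ^ 2` seed (correct to 5 bits) instead of the table, so it needs three steps to reach 32 bits.

```c
#include <assert.h>
#include <stdint.h>

/* Compute the inverse of an odd d modulo 2^32, so that d * inv == 1
   (mod 2^32).  Seed (3*d)^2 is correct to 5 low bits; each Newton
   step inv = 2*inv - inv*inv*d doubles the correct bits: 5 -> 10 ->
   20 -> 40 >= 32. */
static uint32_t binvert32(uint32_t d)
{
    assert(d & 1);                      /* d must be odd */
    uint32_t inv = (3 * d) ^ 2;         /* 5 correct low bits */
    inv = 2 * inv - inv * inv * d;      /* 10 bits */
    inv = 2 * inv - inv * inv * d;      /* 20 bits */
    inv = 2 * inv - inv * inv * d;      /* 40 >= 32 bits */
    return inv;
}
```

The assembly reaches the same result with one fewer multiply chain because its 8-bit table seed only needs two steps (8 -> 16 -> 32).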
diff --git a/third_party/gmp/mpn/x86/p6/copyd.asm b/third_party/gmp/mpn/x86/p6/copyd.asm
new file mode 100644
index 0000000..1be7636
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/copyd.asm
@@ -0,0 +1,178 @@
+dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 1.75 cycles/limb, or 0.75 if no overlap
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C An explicit loop is used because a decrementing rep movsl is a bit slow at
+C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
+C code here stands a chance of being faster if the branches predict well.
+C
+C The slightly strange loop form seems necessary for the claimed speed.
+C Maybe load/store ordering affects it.
+C
+C The source and destination are checked to see if they're actually
+C overlapping, since it might be possible to use an incrementing rep movsl
+C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
+C version.)
+C
+C Enhancements:
+C
+C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
+C one store each cycle.  Unrolling the loop below would approach 1.0, but
+C it'd be good to know why something like store/load/subl + store/load/jnz
+C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
+C cycles, but doesn't run that way.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  re-using parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EDI,`PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	subl	$1, %ecx
+	jb	L(zero)
+
+	movl	(%esi,%ecx,4), %eax		C src[size-1]
+	jz	L(one)
+
+	movl	-4(%esi,%ecx,4), %edx		C src[size-2]
+	subl	$2, %ecx
+	jbe	L(done_loop)			C 2 or 3 limbs only
+
+
+	C The usual overlap is
+	C
+	C     high                   low
+	C     +------------------+
+	C     |               dst|
+	C     +------------------+
+	C           +------------------+
+	C           |               src|
+	C           +------------------+
+	C
+	C We can use an incrementing copy in the following circumstances.
+	C
+	C     src+4*size<=dst, since then the regions are disjoint
+	C
+	C     src==dst, clearly (though this shouldn't occur normally)
+	C
+	C     src>dst, since in that case it's a requirement of the
+	C              parameters that src>=dst+size*4, and hence the
+	C              regions are disjoint
+	C
+
+	leal	(%edi,%ecx,4), %edx
+	cmpl	%edi, %esi
+	jae	L(use_movsl)		C src >= dst
+
+	cmpl	%edi, %edx
+	movl	4(%esi,%ecx,4), %edx	C src[size-2] again
+	jbe	L(use_movsl)		C src+4*size <= dst
+
+
+L(top):
+	C eax	prev high limb
+	C ebx
+	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
+	C edx	prev low limb
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+
+	movl	%edx, 4(%edi,%ecx,4)
+	movl	-4(%esi,%ecx,4), %edx
+
+	subl	$2, %ecx
+	jnbe	L(top)
+
+
+L(done_loop):
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	%edx, 4(%edi,%ecx,4)
+
+	C copy low limb (needed if size was odd, but will already have been
+	C done in the loop if size was even)
+	movl	(%esi), %eax
+L(one):
+	movl	%eax, (%edi)
+	movl	SAVE_EDI, %edi
+	movl	SAVE_ESI, %esi
+
+	ret
+
+
+L(use_movsl):
+	C eax
+	C ebx
+	C ecx	size-3
+	C edx
+	C esi	src
+	C edi	dst
+	C ebp
+
+	addl	$3, %ecx
+
+	cld		C better safe than sorry, see mpn/x86/README
+
+	rep
+	movsl
+
+L(zero):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+
+	ret
+
+EPILOGUE()
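[Editor's note] The overlap analysis in the comments above (incrementing copy is safe when `src >= dst` or `src + 4*size <= dst`, otherwise a decrementing copy is required) can be sketched in C as follows. This is an illustration of the decision logic, not GMP code; comparing pointers into unrelated objects is formally undefined in ISO C, which the real assembly does not have to care about.

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Overlap rule from mpn_copyd: a high-to-low copy is only needed when
   dst overlaps src from above; otherwise a plain incrementing copy
   (rep movsl in the asm, memmove here) is safe and faster. */
static void copyd_sketch(unsigned long *dst, const unsigned long *src, size_t n)
{
    if (src >= dst || src + n <= dst) {
        /* src above dst, src == dst, or disjoint regions */
        memmove(dst, src, n * sizeof *dst);
    } else {
        /* dst overlaps src from above: copy backwards */
        for (size_t i = n; i-- > 0; )
            dst[i] = src[i];
    }
}
```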
diff --git a/third_party/gmp/mpn/x86/p6/dive_1.asm b/third_party/gmp/mpn/x86/p6/dive_1.asm
new file mode 100644
index 0000000..7d61a18
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/dive_1.asm
@@ -0,0 +1,267 @@
+dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       odd  even  divisor
+C P6:  10.0  12.0  cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there are too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+deflit(STACK_SPACE, 20)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	bsfl	%eax, %ecx		C trailing twos
+
+	movl	%ebp, SAVE_EBP
+
+	shrl	%cl, %eax		C d without twos
+
+	movl	%eax, %edx
+	shrl	%eax			C d/2 without twos
+
+	movl	%edx, PARAM_DIVISOR
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
+')
+
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebx,4), %esi	C src end
+
+	imull	PARAM_DIVISOR, %ebp	C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	leal	(%edi,%ebx,4), %edi	C dst end
+	negl	%ebx			C -size
+
+	movl	%edi, PARAM_DST
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	PARAM_DIVISOR, %eax
+	imull	%ebp, %eax
+	cmpl	$1, %eax')
+
+	movl	%ebp, VAR_INVERSE
+	movl	(%esi,%ebx,4), %eax	C src[0]
+
+	orl	%ecx, %ecx
+	jnz	L(even)
+
+	C ecx initial carry is zero
+	jmp	L(odd_entry)
+
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%ebp, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(odd_top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+	negl	%ecx
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(even):
+	C eax	src[0]
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx
+	C esi
+	C edi
+	C ebp
+
+	xorl	%ebp, %ebp		C initial carry bit
+	xorl	%edx, %edx		C initial carry limb (for size==1)
+
+	incl	%ebx
+	jz	L(even_one)
+
+	movl	(%esi,%ebx,4), %edi	C src[1]
+
+	shrdl(	%cl, %edi, %eax)
+
+	jmp	L(even_entry)
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size] and scratch
+	C ebp	carry bit
+
+	movl	(%esi,%ebx,4), %edi
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi,%ebx,4), %eax
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebp, %eax
+
+	sbbl	%ebp, %ebp
+	subl	%edx, %eax
+
+	sbbl	$0, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	PARAM_DST, %edi
+	negl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+	jnz	L(even_top)
+
+
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax
+
+L(even_one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebp, %eax
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
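[Editor's note] The odd-divisor loop above implements exact division with one multiply by the precomputed inverse per quotient limb: `q[i] = (a[i] - carry) * inv mod 2^32`, with the next carry taken from the high half of `q[i]*d` plus the subtraction borrow. A C sketch of that loop, assuming the division is known to be exact and `d` odd (`binvert32` here is a stand-in for GMP's `binvert_limb`):

```c
#include <assert.h>
#include <stdint.h>

static uint32_t binvert32(uint32_t d)   /* inverse of odd d mod 2^32 */
{
    uint32_t inv = (3 * d) ^ 2;
    for (int i = 0; i < 3; i++)
        inv = 2 * inv - inv * inv * d;
    return inv;
}

/* q = a / d exactly, d odd, little-endian limbs, n > 0.  Mirrors the
   L(odd_top)/L(odd_entry) loop: subtract carry, multiply by the
   inverse, recover the next carry from the high half of q*d. */
static void divexact_1_sketch(uint32_t *q, const uint32_t *a, int n, uint32_t d)
{
    uint32_t inv = binvert32(d), c = 0;
    for (int i = 0; i < n; i++) {
        uint32_t borrow = a[i] < c;              /* borrow out of subtract */
        uint32_t s = a[i] - c;
        uint32_t ql = s * inv;                   /* quotient limb */
        q[i] = ql;
        c = (uint32_t)(((uint64_t)ql * d) >> 32) + borrow;
    }
}
```

The even-divisor path in the assembly first shifts the trailing zeros of `d` out of the source with `shrdl`, then runs the same recurrence.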
diff --git a/third_party/gmp/mpn/x86/p6/gcd_11.asm b/third_party/gmp/mpn/x86/p6/gcd_11.asm
new file mode 100644
index 0000000..80e055e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/gcd_11.asm
@@ -0,0 +1,83 @@
+dnl  x86 mpn_gcd_11 optimised for processors with fast BSF.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked by Torbjorn Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K7	 7.80
+C AMD K8,K9	 7.79
+C AMD K10	 4.08
+C AMD bd1	 ?
+C AMD bobcat	 7.82
+C Intel P4-2	14.9
+C Intel P4-3/4	14.0
+C Intel P6/13	 5.09
+C Intel core2	 4.22
+C Intel NHM	 5.00
+C Intel SBR	 5.00
+C Intel atom	17.1
+C VIA nano	?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+
+	mov	12(%esp), %eax
+	mov	16(%esp), %edx
+	jmp	L(odd)
+
+	ALIGN(16)		C               K10   BD    C2    NHM   SBR
+L(top):	cmovc(	%esi, %eax)	C u = |v - u|   0,3   0,3   0,6   0,5   0,5
+	cmovc(	%edi, %edx)	C v = min(u,v)  0,3   0,3   2,8   1,7   1,7
+	shr	%cl, %eax	C               1,7   1,6   2,8   2,8   2,8
+L(odd):	mov	%edx, %esi	C               1     1     4     3     3
+	sub	%eax, %esi	C               2     2     5     4     4
+	bsf	%esi, %ecx	C               3     3     6     5     5
+	mov	%eax, %edi	C               2     2     3     3     4
+	sub	%edx, %eax	C               2     2     4     3     4
+	jnz	L(top)		C
+
+L(end):	mov	%edx, %eax
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
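[Editor's note] The `L(top)`/`L(odd)` loop above is a binary GCD step for two odd operands: replace `(u, v)` by `(|u - v| >> ctz, min(u, v))`, where `bsf` supplies the shift count. A C rendering of the same recurrence (`__builtin_ctz` is the GCC/Clang counterpart of `bsf`; this sketch assumes both inputs are odd, as `mpn_gcd_11` does):

```c
#include <assert.h>
#include <stdint.h>

/* Binary GCD for two odd 32-bit operands: the difference of two odd
   numbers is even and nonzero, so stripping its trailing zeros keeps
   both operands odd and strictly shrinks them. */
static uint32_t gcd_11_sketch(uint32_t u, uint32_t v)
{
    while (u != v) {
        if (u < v) { uint32_t t = u; u = v; v = t; }  /* v = min(u,v) */
        u -= v;                                        /* even, nonzero */
        u >>= __builtin_ctz(u);                        /* strip twos */
    }
    return u;
}
```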
diff --git a/third_party/gmp/mpn/x86/p6/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000..96c96fd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,194 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                74
+#define MUL_TOOM44_THRESHOLD               181
+#define MUL_TOOM6H_THRESHOLD               252
+#define MUL_TOOM8H_THRESHOLD               363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  4,23,258,768,2388
+
+#define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 5}, \
+    {    383, 4}, {    991, 5}, {    511, 6}, {    267, 7}, \
+    {    157, 8}, {     91, 9}, {     47, 8}, {    111, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             472  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 4}, {   1023, 8}, {     67, 9}, \
+    {     39, 5}, {    639, 4}, {   1471, 6}, {    383, 7}, \
+    {    209, 8}, {    119, 9}, {     63, 7}, {    255, 8}, \
+    {    139, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    167,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671, 9}, {   1343,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 20
+#define DC_DIVAPPR_Q_THRESHOLD              56
+#define DC_BDIV_QR_THRESHOLD                60
+#define DC_BDIV_Q_THRESHOLD                134
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD                66
+#define INV_APPR_THRESHOLD                  63
+
+#define BINV_NEWTON_THRESHOLD              250
+#define REDC_1_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1164
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               38
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD_THRESHOLD                      64
+#define HGCD_APPR_THRESHOLD                105
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               587
+#define SET_STR_PRECOMPUTE_THRESHOLD      1104
diff --git a/third_party/gmp/mpn/x86/p6/lshsub_n.asm b/third_party/gmp/mpn/x86/p6/lshsub_n.asm
new file mode 100644
index 0000000..7ada213
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/lshsub_n.asm
@@ -0,0 +1,169 @@
+dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
+
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
+C     improved speed on P6/13.  Presumably, the K7 will want scheduling if
+C     it wants to use MMX at all.
+C (2) We could save a register by not alternatingly using eax and edx in the
+C     loop.
+
+define(`rp',	`%edi')
+define(`up',	`%esi')
+define(`vp',	`%ebx')
+define(`n',	`%ecx')
+define(`cnt',	`%mm7')
+
+ASM_START()
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_lshsub_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), vp
+	mov	28(%esp), n
+	mov	$32, %eax
+	sub	32(%esp), %eax
+	movd	%eax, cnt
+
+	lea	(up,n,4), up
+	lea	(vp,n,4), vp
+	lea	(rp,n,4), rp
+
+	neg	n
+	mov	n, %eax
+	and	$-8, n
+	and	$7, %eax
+	shl	%eax				C eax = 2x
+	lea	(%eax,%eax,4), %edx		C edx = 10x
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
+')
+
+	pxor	%mm1, %mm1
+	pxor	%mm0, %mm0
+
+	jmp	*%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	lea	(%eax,%edx,2), %eax
+	add	$L(ent)-L(here), %eax
+	add	(%esp), %eax
+	ret_internal
+')
+
+L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
+	sbb	%eax, %eax
+	neg	%eax
+	mov	32(%esp), %ecx
+	shld	%cl, %edx, %eax
+
+	emms
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+	ALIGN(16)
+L(top):	jecxz	L(end)
+L(ent):	mov	   0(up,n,4), %eax
+	sbb	   0(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 0(rp,n,4)
+
+	mov	   4(up,n,4), %edx
+	sbb	   4(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 4(rp,n,4)
+
+	mov	   8(up,n,4), %eax
+	sbb	   8(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 8(rp,n,4)
+
+	mov	   12(up,n,4), %edx
+	sbb	   12(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 12(rp,n,4)
+
+	mov	   16(up,n,4), %eax
+	sbb	   16(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 16(rp,n,4)
+
+	mov	   20(up,n,4), %edx
+	sbb	   20(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 20(rp,n,4)
+
+	mov	   24(up,n,4), %eax
+	sbb	   24(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 24(rp,n,4)
+
+	mov	   28(up,n,4), %edx
+	sbb	   28(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 28(rp,n,4)
+
+	lea	   8(n), n
+	jmp	   L(top)
+
+EPILOGUE()
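[Editor's note] `mpn_lshsub_n` above fuses a limb-vector subtraction with a left shift: `r = (u - v) << cnt`, returning the limb shifted out the top with the final borrow folded in (the `shld` at `L(end)`). A portable C sketch of that combination, assuming `0 < cnt < 32` as the MMX code does:

```c
#include <assert.h>
#include <stdint.h>

/* r = (u - v) << cnt over n little-endian 32-bit limbs.  Each result
   limb combines the current difference shifted up with the top bits of
   the previous one; the return value is the carry-out limb, i.e. the
   borrow bit shifted into position plus the top bits of the last
   difference. */
static uint32_t lshsub_n_sketch(uint32_t *r, const uint32_t *u,
                                const uint32_t *v, int n, unsigned cnt)
{
    uint32_t borrow = 0, prev = 0;
    for (int i = 0; i < n; i++) {
        uint64_t d = (uint64_t)u[i] - v[i] - borrow;
        borrow = (uint32_t)(d >> 63);          /* borrow out of subtract */
        uint32_t dl = (uint32_t)d;
        r[i] = (dl << cnt) | (prev >> (32 - cnt));
        prev = dl;
    }
    return (borrow << cnt) | (prev >> (32 - cnt));
}
```

The MMX loop gets the per-limb shift for free by pairing adjacent difference limbs with `punpckldq` and doing one 64-bit `psrlq` per limb.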
diff --git a/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm b/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000..5300616
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,767 @@
+dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
+C see that file for some comments.  It's possible what's here can be improved.
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The different speeds of the integer and fraction parts mean that using
+dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
+dnl  for the integer part and a bit lower for the fraction part.  (Or what's
+dnl  really wanted is to speed up the integer part!)
+dnl
+dnl  The threshold is set to make the integer part right.  At 4 limbs the
+dnl  div and mul are about the same there, but on the fractional part the
+dnl  mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,    -4)
+defframe(SAVE_ESI,    -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+defframe(VAR_NORM,    -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC,     -28)
+defframe(VAR_DST,     -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+	movl	PARAM_XSIZE, %ecx
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edx
+
+	movl	-4(%esi,%ebx,4), %eax	C src high limb
+	xorl	%edi, %edi		C initial carry (if can't skip a div)
+
+	C
+
+	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
+	xor	%ecx, %ecx
+
+	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edi)		C high is carry if high<divisor
+
+	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
+
+	sbbl	$0, %ebx		C skip one division if high<divisor
+	movl	PARAM_PREINV_SHIFT, %ecx
+
+	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
+	movl	$32, %eax
+
+	movl	%edx, VAR_DST		C &dst[xsize+size]
+
+	shll	%cl, %ebp		C d normalized
+	subl	%ecx, %eax
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7		C rshift
+	movl	PARAM_PREINV_INVERSE, %eax
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+	movl	PARAM_CARRY, %edx
+
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	leal	-4(%edi,%ebx,4), %edi
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	$0, %edx		C initial carry (if can't skip a div)
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+	orl	%ecx, %ecx		C size
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jz	L(no_skip_div)		C if size==0
+
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+	xorl	%esi, %esi
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+
+	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%esi, (%edi,%ecx,4)	C dst high limb
+
+	sbbl	$0, %ecx		C size-1 if high<divisor
+	movl	PARAM_SRC, %esi		C reload
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	%ebp
+
+	movl	%eax, (%edi,%ecx,4)
+	decl	%ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	movl	PARAM_DST, %edi
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	movl	%edx, %eax
+
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	scratch (remainder)
+	C esi
+	C edi	dst
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	decl	%ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
+
+	movl	%ebx, VAR_DST_STOP
+	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
+
+	movl	%edi, VAR_DST
+	movl	%ecx, %ebx		C size
+
+	bsrl	%ebp, %ecx		C 31-l
+	movl	%edx, %edi		C carry
+
+	leal	1(%ecx), %eax		C 32-l
+	xorl	$31, %ecx		C l
+
+	movl	%ecx, VAR_NORM
+	movl	$-1, %edx
+
+	shll	%cl, %ebp		C d normalized
+	movd	%eax, %mm7
+
+	movl	$-1, %eax
+	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	size
+	C ecx	shift
+	C edx
+	C esi	src
+	C edi	carry
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	movl	%eax, VAR_INVERSE
+	orl	%ebx, %ebx		C size
+	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
+
+	movl	%eax, VAR_SRC
+	jz	L(start_zero)
+
+	movl	8(%eax), %esi		C src high limb
+	cmpl	$1, %ebx
+	jz	L(start_one)
+
+L(start_two_or_more):
+	movl	4(%eax), %edx		C src second highest limb
+
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
+
+	cmpl	$2, %ebx
+	je	L(integer_two_left)
+	jmp	L(integer_top)
+
+
+L(start_one):
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shll	%cl, %esi		C n10 = high << l
+	jmp	L(integer_one_left)
+
+
+L(start_zero):
+	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+	C skipped a division.
+
+	shll	%cl, %edi		C n2 = carry << l
+	movl	%edi, %eax		C return value for zero_done
+	cmpl	$0, PARAM_XSIZE
+
+	je	L(zero_done)
+	jmp	L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest.  A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C		       uops
+C		n2+n1	1   (addl)
+C		mul	5
+C		q1+1	3   (addl/adcl)
+C		mul	5
+C		sub	3   (subl/sbbl)
+C		addback	2   (cmov)
+C		       ---
+C		       19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+	ALIGN(16)
+L(integer_top):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	d
+	C
+	C mm0	scratch (src qword)
+	C mm7	rshift for normalization
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	VAR_SRC, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+	movq	(%ecx), %mm0       C next src limb and the one below it
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_SRC
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	movl	%ebp, %eax	   C d
+	leal	1(%edi), %ebx      C n2+1
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+
+	mull	%ebx		   C (q1+1)*d
+
+	movl	VAR_DST, %ecx
+	psrlq	%mm7, %mm0
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+	movl	VAR_DST_STOP, %eax
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	movd	%mm0, %esi
+
+	sbbl	$0, %ebx	   C q
+	subl	$4, %ecx
+
+	movl	%ebx, (%ecx)
+	cmpl	%eax, %ecx
+
+	movl	%ecx, VAR_DST
+	jne	L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case.  This makes the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	PARAM_SRC, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movd	(%ecx), %mm0	   C src low limb
+
+	movl	VAR_DST_STOP, %ecx
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	C
+
+	subl	%eax, %esi
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	movd	%mm0, %esi
+
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	VAR_DST_STOP, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	C
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx           C q1 if q1+1 overflowed
+
+	mull	%ebx
+
+	C
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+	movl	PARAM_XSIZE, %eax
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -8(%ecx)
+	subl	$8, %ecx
+
+
+
+	orl	%eax, %eax         C xsize
+	jnz	L(fraction_some)
+
+	movl	%edi, %eax
+L(fraction_done):
+	movl	VAR_NORM, %ecx
+L(zero_done):
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	shrl	%cl, %eax
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+	C eax	(divisor)
+	C ebx	(q1+1 == 0)
+	C ecx
+	C edx
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+
+	movl	VAR_DST, %ecx
+	movl	VAR_DST_STOP, %edx
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_DST
+	psrlq	%mm7, %mm0
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+
+	movl	$-1, (%ecx)
+	movd	%mm0, %esi		C next n10
+
+	cmpl	%ecx, %edx
+	jne	L(integer_top)
+
+	jmp	L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C		       uops
+C		mul	5
+C		q1+1	1   (addl)
+C		mul	5
+C		sub	3   (negl/sbbl)
+C		addback	2   (cmov)
+C		       ---
+C		       16
+C
+C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+	ALIGN(16)
+L(fraction_some):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	carry
+	C ebp	divisor
+
+	movl	PARAM_DST, %esi
+	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
+	movl	%edi, %eax
+
+	subl	$8, %ecx		C &dst[xsize]
+
+
+	ALIGN(16)
+L(fraction_top):
+	C eax	n2, then scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst, decrementing
+	C edx	scratch
+	C esi	dst stop point
+	C edi	n2
+	C ebp	divisor
+
+	mull	VAR_INVERSE	C m*n2
+
+	movl	%ebp, %eax	C d
+	subl	$4, %ecx	C dst
+	leal	1(%edi), %ebx
+
+	C
+
+	C
+
+	C
+
+	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
+
+	mull	%ebx		C (q1+1)*d
+
+	C
+
+	C
+
+	C
+
+	C
+
+	negl	%eax		C low of n - (q1+1)*d
+
+	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
+	leal	(%ebp,%eax), %edx
+
+	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
+
+	sbbl	$0, %ebx	C q
+	movl	%eax, %edi	C remainder->n2
+	cmpl	%esi, %ecx
+
+	movl	%ebx, (%ecx)	C previous q
+	jne	L(fraction_top)
+
+
+	jmp	L(fraction_done)
+
+EPILOGUE()
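The pre-inverse set up in mul_by_inverse above, floor((b*(b-d)-1)/d) with b = 2^32 and d normalized, can be modeled in plain C for checking purposes (a sketch with a hypothetical name, not GMP's invert_limb):

```c
#include <stdint.h>
#include <assert.h>

/* Hypothetical model of the inverse computed before L(start_preinv):
   for a divisor d normalized so its top bit is set, compute
   m = floor ((b*(b-d) - 1) / d) with b = 2^32.  Since d >= 2^31,
   b - d fits in 32 bits and b*(b-d) - 1 fits in 64 bits. */
static uint32_t div_preinv_model (uint32_t d)
{
  assert (d & 0x80000000u);                  /* d must be normalized */
  uint32_t b_minus_d = (uint32_t) (0u - d);  /* 2^32 - d */
  uint64_t num = ((uint64_t) b_minus_d << 32) - 1;
  return (uint32_t) (num / d);
}
```

This is the same quantity the asm obtains from the movl $-1/subl/divl sequence: edx:eax holds b*(b-d)-1 and the divl produces the 32-bit inverse.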
diff --git a/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h
new file mode 100644
index 0000000..ef29061
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 800 MHz P6 model 8 */
+/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */
+
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        30
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     14
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           62
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               254
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     100
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30	/* WRONG value, see comment above */
+#define SQR_TOOM3_THRESHOLD                 83
+#define SQR_TOOM4_THRESHOLD                196
+#define SQR_TOOM6_THRESHOLD                214
+#define SQR_TOOM8_THRESHOLD                381
+
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95, 9}, {    199,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543, 8}, {   1087,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    447,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    703,10}, {   1407,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3327,13}, {   1919,12}, {   3839,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD                 7040
+
+#define SQR_FFT_MODF_THRESHOLD             376  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    376, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    167,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511, 9}, \
+    {    271,10}, {    143, 9}, {    287, 8}, {    575, 9}, \
+    {    303, 8}, {    607,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    479,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    703,10}, \
+    {   1407,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,10}, {   1919,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 177
+#define SQRLO_SQR_THRESHOLD               8937
+
+#define DC_DIV_QR_THRESHOLD                 80
+#define DC_DIVAPPR_Q_THRESHOLD             240
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                166
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               262
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              272
+#define REDC_1_TO_REDC_N_THRESHOLD          72
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1470
+#define MUPI_DIV_QR_THRESHOLD              124
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1341
+
+#define POWM_SEC_TABLE  1,16,96,416,1259
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1084
+
+#define FAC_DSC_THRESHOLD                  194
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     124
+#define HGCD_APPR_THRESHOLD                152
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   474
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   1
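These tuned constants are consumed as size cutoffs when selecting among algorithms. A minimal sketch of that pattern (the dispatcher name and enum are hypothetical, not GMP's internal code), using two of the multiply thresholds above:

```c
#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 22   /* values from the table above */
#define MUL_TOOM33_THRESHOLD 73

enum mul_algo { MUL_BASECASE, MUL_TOOM22, MUL_TOOM33 };

/* Hypothetical dispatcher: pick the multiplication algorithm from the
   (equal) operand size n in limbs, smallest threshold first. */
static enum mul_algo choose_mul (size_t n)
{
  if (n < MUL_TOOM22_THRESHOLD)
    return MUL_BASECASE;
  if (n < MUL_TOOM33_THRESHOLD)
    return MUL_TOOM22;
  return MUL_TOOM33;
}
```

Each threshold marks the operand size at which the next asymptotically faster method starts to pay off on this particular CPU, which is why the values are machine-specific and generated by tuneup.c.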
diff --git a/third_party/gmp/mpn/x86/p6/mmx/lshift.asm b/third_party/gmp/mpn/x86/p6/mmx/lshift.asm
new file mode 100644
index 0000000..febd1c0
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/lshift.asm
@@ -0,0 +1,38 @@
+dnl  Intel Pentium-II mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl  at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mmx/popham.asm b/third_party/gmp/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000..fd340e4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl  hamming distance.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx)
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mmx/rshift.asm b/third_party/gmp/mpn/x86/p6/mmx/rshift.asm
new file mode 100644
index 0000000..77aa190
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/rshift.asm
@@ -0,0 +1,38 @@
+dnl  Intel Pentium-II mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl  at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm b/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm
new file mode 100644
index 0000000..b88ab5d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl  Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 2.0 cycles/limb
+
+C TODO
+C  Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
+C  with the current carry handling scheme.
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
+C into 2mod3, but at that point going into a separate carries total so we
+C don't keep the carry flag live across the loop control.  Avoiding decl
+C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx		C size-2
+	movl	(%edx), %eax		C src[0]
+	ja	L(three_or_more)
+	jb	L(one)
+
+	C size==2
+
+	movl	4(%edx), %ecx		C src[1]
+
+	movl	%eax, %edx		C src[0]
+	shrl	$24, %eax		C src[0] high
+
+	andl	$0xFFFFFF, %edx		C src[0] low
+
+	addl	%edx, %eax
+	movl	%ecx, %edx		C src[1]
+	shrl	$16, %ecx		C src[1] high
+
+	andl	$0xFFFF, %edx
+	addl	%ecx, %eax
+
+	shll	$8, %edx		C src[1] low
+
+	addl	%edx, %eax
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax	src[0], initial acc 0mod3
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	4(%edx), %ebx		C src[1], initial 1mod3
+	subl	$3, %ecx		C size-5
+
+	movl	%esi, SAVE_ESI
+	movl	8(%edx), %esi		C src[2], initial 2mod3
+
+	pushl	%edi	FRAME_pushl()
+	movl	$0, %edi		C initial carries 0mod3
+	jng	L(done)			C if size < 6
+
+
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi	carries into 0mod3
+	C ebp
+
+	addl	12(%edx), %eax
+	adcl	16(%edx), %ebx
+	adcl	20(%edx), %esi
+	leal	12(%edx), %edx
+	adcl	$0, %edi
+
+	subl	$3, %ecx
+	jg	L(top)			C at least 3 more to process
+
+
+L(done):
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
+	cmpl	$-1, %ecx
+	jl	L(done_0)		C if -2, meaning 0 more limbs
+
+	C 1 or 2 more limbs
+	movl	$0, %ecx
+	je	L(done_1)		C if -1, meaning 1 more limb only
+	movl	16(%edx), %ecx
+L(done_1):
+	addl	12(%edx), %eax		C 0mod3
+	adcl	%ecx, %ebx		C 1mod3
+	adcl	$0, %esi		C 2mod3
+	adcl	$0, %edi		C carries 0mod3
+
+L(done_0):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	carries 0mod3
+	C ebp
+
+	movl	%eax, %ecx		C 0mod3
+	shrl	$24, %eax		C 0mod3 high initial total
+
+	andl	$0xFFFFFF, %ecx		C 0mod3 low
+	movl	%edi, %edx		C carries
+	shrl	$24, %edi		C carries high
+
+	addl	%ecx, %eax		C add 0mod3 low
+	andl	$0xFFFFFF, %edx		C carries 0mod3 low
+	movl	%ebx, %ecx		C 1mod3
+
+	shrl	$16, %ebx		C 1mod3 high
+	addl	%edi, %eax		C add carries high
+	addl	%edx, %eax		C add carries 0mod3 low
+
+	andl	$0xFFFF, %ecx		C 1mod3 low mask
+	addl	%ebx, %eax		C add 1mod3 high
+	movl	SAVE_EBX, %ebx
+
+	shll	$8, %ecx		C 1mod3 low
+	movl	%esi, %edx		C 2mod3
+	popl	%edi	FRAME_popl()
+
+	shrl	$8, %esi		C 2mod3 high
+	andl	$0xFF, %edx		C 2mod3 low mask
+	addl	%ecx, %eax		C add 1mod3 low
+
+	shll	$16, %edx		C 2mod3 low
+	addl	%esi, %eax		C add 2mod3 high
+	movl	SAVE_ESI, %esi
+
+	addl	%edx, %eax		C add 2mod3 low
+
+	ret
+
+EPILOGUE()
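The folding scheme above rests on 2^96 ≡ 1 (mod 2^24-1), so 2^32 ≡ 2^8 and 2^64 ≡ 2^16. A hedged C model of that congruence (a checking aid with a hypothetical name; the asm keeps three 32-bit accumulators and only returns a congruent value, not the fully reduced one computed here):

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical model of mpn_mod_34lsub1: limb i contributes
   src[i] * 2^(32*i), and since 2^32 == 2^8, 2^64 == 2^16 and
   2^96 == 1 (mod 2^24-1), limbs fold into byte-shifted positions
   by i mod 3.  Fully reduced here for easy checking.  The simple
   accumulator is safe from overflow for n < 2^16 or so. */
static uint32_t mod_34lsub1_model (const uint32_t *src, size_t n)
{
  uint64_t acc = 0;
  for (size_t i = 0; i < n; i++)
    acc += (uint64_t) src[i] << (8 * (i % 3));
  return (uint32_t) (acc % 0xFFFFFFu);
}
```

This mod-2^24-1 value is useful as a cheap congruence check (for example in multiply verification), which is why only a congruent result, not a full reduction, is needed from the asm.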
diff --git a/third_party/gmp/mpn/x86/p6/mode1o.asm b/third_party/gmp/mpn/x86/p6/mode1o.asm
new file mode 100644
index 0000000..7083195
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mode1o.asm
@@ -0,0 +1,170 @@
+dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C It's not worth skipping a step at the end when high<divisor since the main
+C loop is only 10 cycles.
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+dnl  Not enough room under modexact_1 to make these re-use the parameter
+dnl  space, unfortunately.
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+deflit(STACK_SPACE, 12)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	shrl	%eax			C d/2
+	movl	%edi, SAVE_EDI
+
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+	movzbl	(%eax,%edi), %edi		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
+')
+
+	xorl	%edx, %edx		C initial extra carry
+	leal	(%edi,%edi), %eax	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	imull	PARAM_DIVISOR, %edi	C inv*inv*d
+
+	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %edi	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	negl	%ebx			C -size
+
+	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	PARAM_DIVISOR, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax')
+
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%edi, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit, 0 or 1
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	inverse
+	C ebp
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+	imull	%edi, %eax
+
+	negl	%ecx
+
+	mull	PARAM_DIVISOR
+
+	incl	%ebx
+	jnz	L(top)
+
+
+	movl	SAVE_ESI, %esi
+	leal	(%ecx,%edx), %eax
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
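The inverse setup above uses the Newton step inv = 2*inv - inv*inv*d, each application doubling the number of correct low bits. A Python sketch of the same iteration, seeded from d itself (3 correct bits for any odd d) instead of the 8-bit binvert_limb_table, so it needs one extra step:

```python
def binvert_u32(d):
    # Newton iteration inv = 2*inv - inv*inv*d (mod 2**32), as in the asm.
    # Seed: for odd d, d*d == 1 (mod 8), so d is its own inverse to 3 bits.
    assert d & 1, "modexact requires an odd divisor"
    mask = (1 << 32) - 1
    inv = d
    for _ in range(4):          # correct bits: 3 -> 6 -> 12 -> 24 -> 48
        inv = (2 * inv - inv * inv * d) & mask
    return inv
```

This is the invariant the ASSERT in the asm checks: d*inv == 1 mod 2^GMP_LIMB_BITS.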
diff --git a/third_party/gmp/mpn/x86/p6/mul_basecase.asm b/third_party/gmp/mpn/x86/p6/mul_basecase.asm
new file mode 100644
index 0000000..d87bc12
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mul_basecase.asm
@@ -0,0 +1,607 @@
+dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
+
+
+dnl  P6 UNROLL_COUNT cycles/product (approx)
+dnl           8           7
+dnl          16           6.5
+dnl          32           6.4
+dnl  Maximum possible with the current code is 32.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+
+	movl	(%eax), %eax		C yp[0]
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	mull	(%edx)
+
+	movl	PARAM_WP, %ecx
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_XSIZE')
+define(SAVE_ESI, `PARAM_YSIZE')
+
+	movl	%ebx, SAVE_EBX
+	cmpl	$1, PARAM_YSIZE
+	movl	%eax, %ecx		C yp[0]
+
+	movl	%esi, SAVE_ESI		C save esi
+	movl	PARAM_WP, %ebx
+	movl	%edx, %esi		C xp
+
+	movl	(%edx), %eax		C xp[0]
+	jne	L(two_by_two)
+
+
+	C two limbs by one limb
+	C
+	C eax	xp[0]
+	C ebx	wp
+	C ecx	yp[0]
+	C edx
+	C esi	xp
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+	movl	%edx, %esi		C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+
+	movl	%esi, 4(%ebx)
+	movl	SAVE_ESI, %esi
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	movl	SAVE_EBX, %ebx
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp[0]
+	C ebx	wp
+	C ecx	yp[0]
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+
+dnl  more parameter space re-use
+define(SAVE_EDI, `PARAM_WP')
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	movl	%edi, SAVE_EDI
+	movl	%edx, %edi	C carry, for wp[1]
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+	movl	4(%ecx), %ecx	C yp[1]
+
+	movl	%edi, 4(%ebx)
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+	movl	(%esi), %eax	C xp[0]
+
+	adcl	$0, %edx
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	movl	%esi, %eax
+
+	adcl	%edx, %edi
+	movl	SAVE_ESI, %esi
+
+	movl	%edi, 8(%ebx)
+
+	adcl	$0, %eax
+	movl	SAVE_EDI, %edi
+
+	movl	%eax, 12(%ebx)
+	movl	SAVE_EBX, %ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 loop running at
+C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
+C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
+C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
+C and doesn't seem likely to be worthwhile on the typical sizes reaching
+C here from the Karatsuba code.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+
+defframe(`SAVE_EBX',    -4)
+defframe(`SAVE_ESI',    -8)
+defframe(`SAVE_EDI',   -12)
+defframe(`SAVE_EBP',   -16)
+defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
+defframe(VAR_ADJUST,   -24)
+defframe(VAR_JMP,      -28)
+defframe(VAR_SWAP,     -32)
+defframe(VAR_XP_LOW,   -36)
+deflit(STACK_SPACE, 36)
+
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_WP, %edi
+
+	movl	%ebx, SAVE_EBX
+
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+
+	movl	%esi, SAVE_ESI
+	xorl	%ebx, %ebx
+	leal	(%edx,%ecx,4), %esi	C xp end
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+
+	movl	%ebx, (%edi)		C final carry
+	movl	PARAM_XSIZE, %ecx
+	decl	%edx
+
+	jz	L(done)			C if ysize==1
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+	C simple addmul looping
+	C
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	leal	4(%eax,%edx,4), %ebp	C yp end
+	negl	%ecx
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE	C -(ysize-1)
+	movl	(%esi,%ecx,4), %eax	C xp low limb
+	incl	%ecx
+
+	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
+	xorl	%ebx, %ebx		C initial carry
+
+	movl	%ebp, PARAM_YP
+	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
+	jmp	L(simple_outer_entry)
+
+
+L(simple_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	PARAM_YP, %edx
+
+	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
+	xorl	%ebx, %ebx		C carry
+
+	movl	%ebp, PARAM_YSIZE
+	addl	$4, %edi		C next position in wp
+
+	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
+
+	movl	-4(%esi,%ecx,4), %eax	C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner_top):
+	C eax	xp limb
+	C ebx	carry limb
+	C ecx	loop counter (negative)
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%eax, %ebx
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %edx
+
+	incl	%ecx
+	movl	%edx, %ebx
+	jnz	L(simple_inner_top)
+
+
+	C separate code for last limb so outer loop counter handling can be
+	C interleaved
+
+	mull	%ebp
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, 4(%edi)
+	jnz	L(simple_outer_top)
+
+
+L(done):
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1, see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
+C give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
+C initial ebx and ecx on entry to the unrolling.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with the VAR_ADJUST value means it's only necessary to do one
+C fetch in the outer loop to take care of xp, wp and the inner loop counter.
+
+
+L(unroll):
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	movl	PARAM_XP, %esi
+
+	movl	4(%eax), %ebp		C multiplier (yp second limb)
+	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
+
+	movl	%eax, PARAM_YP
+	movl	PARAM_WP, %edi
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
+	decl	%ecx				C xsize-1
+
+	movl	(%esi), %eax		C xp low limb
+	andl	$-UNROLL_MASK-1, %ebx
+	negl	%ecx			C -(xsize-1)
+
+	negl	%ebx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ebx, VAR_ADJUST
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+	movl	%eax, VAR_XP_LOW
+	sarl	$UNROLL_LOG2, %ebx
+	negl	%edx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
+')
+
+	movl	%ecx, VAR_JMP
+	movl	%edx, %ecx
+	shll	$31, %edx
+
+	sarl	$31, %edx		C 0 or -1 as xsize odd or even
+	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
+	leal	4(%esi,%ecx,4), %esi	C  and start at second limb
+
+	movl	%edx, VAR_SWAP
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%edx,1), %ecx
+	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi	xp + offset
+	C edi	wp + offset
+	C ebp	ysize counter, negative
+
+	movl	VAR_ADJUST, %ebx
+	movl	PARAM_YP, %edx
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
+
+	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
+	leal	(%esi,%ebx,4), %esi
+	sarl	$UNROLL_LOG2, %ebx
+
+	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	movl	%ebx, VAR_COUNTER
+	movl	%edx, %ebx		C carry high
+	movl	%eax, %ecx		C carry low
+
+	xorl	%edx, %eax
+	movl	VAR_JMP, %edx
+
+	andl	VAR_SWAP, %eax
+
+	xorl	%eax, %ebx		C carries other way for odd index
+	xorl	%eax, %ecx
+
+	jmp	*%edx
+
+
+C -----------------------------------------------------------------------------
+
+L(unroll_inner_top):
+	C eax	xp limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	xp+8
+	C edi	wp
+	C ebp	yp multiplier limb
+	C
+	C VAR_COUNTER  loop counter, negative
+	C
+	C 15 bytes each limb
+
+	addl	$UNROLL_BYTES, %edi
+
+L(unroll_inner_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	adcl	%eax, %ebx		C new carry low
+	movl	%edx, %ecx
+	adcl	$0, %ecx		C new carry high
+
+	movl	disp1(%esi), %eax
+	mull	%ebp
+	addl	%ebx, disp1(%edi)
+	adcl	%eax, %ecx		C new carry low
+	movl	%edx, %ebx
+	adcl	$0, %ebx		C new carry high
+')
+
+
+	incl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+	jnz	L(unroll_inner_top)
+
+
+	C eax
+	C ebx	carry high
+	C ecx	carry low
+	C edx
+	C esi
+	C edi	wp, pointing at second last limb
+	C ebp
+
+deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+deflit(`disp1', eval(disp0 + 4))
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%ecx, disp0(%edi)	C carry low
+
+	adcl	$0, %ebx
+	incl	%ebp
+
+	movl	%ebx, disp1(%edi)	C carry high
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
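The structure of the routine above, a plain mul_1 pass for yp[0] followed by one addmul_1 pass per remaining limb of yp, is the generic schoolbook algorithm. A Python sketch over 32-bit limbs (least significant first; an illustration, not the GMP C code):

```python
LIMB = 1 << 32

def mul_basecase(xp, yp):
    # wp = xp * yp, schoolbook: mul_1 for yp[0], addmul_1 for yp[1..]
    wp = [0] * (len(xp) + len(yp))
    carry = 0
    for i, x in enumerate(xp):          # mul_1 pass
        t = x * yp[0] + carry
        wp[i], carry = t % LIMB, t // LIMB
    wp[len(xp)] = carry
    for j in range(1, len(yp)):         # addmul_1 passes
        carry = 0
        for i, x in enumerate(xp):
            t = wp[i + j] + x * yp[j] + carry
            wp[i + j], carry = t % LIMB, t // LIMB
        wp[j + len(xp)] = carry
    return wp
```

The asm saves time by hoisting the addmul_1 startup work (pointer setup, unroll-entry computation) out of the per-row cost, which a call to mpn_addmul_1 would repeat ysize-1 times.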
diff --git a/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm b/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000..db2f260
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,42 @@
+dnl  Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl  hamming distance.
+
+dnl  Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 generic			6.5		7
+C P3 model 9  (Banias)		?		?
+C P3 model 13 (Dothan)		5.75		6
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sqr_basecase.asm b/third_party/gmp/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000..8fc7fdf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,649 @@
+dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+C     product (measured on the speed difference between 20 and 40 limbs,
+C     which is the Karatsuba recursing range).
+
+
+dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl  a description.  The only difference here is that UNROLL_COUNT can go up
+dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will at least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+	je	L(two_limbs)
+
+	movl	(%eax), %eax
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src limb
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	%eax, %esi
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	4(%esi), %eax
+
+	movl	%ebx, SAVE_EBX
+	movl	%edx, %ebx	C dst[1]
+
+	mull	%eax		C src[1]^2
+
+	movl	%edi, SAVE_EDI
+	movl	%eax, %edi	C dst[2]
+	movl	(%esi), %eax
+
+	movl	%ebp, SAVE_EBP
+	movl	%edx, %ebp	C dst[3]
+
+	mull	4(%esi)		C src[0]*src[1]
+
+	addl	%eax, %ebx
+	movl	SAVE_ESI, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%ebx, %eax
+	movl	SAVE_EBX, %ebx
+
+	adcl	%edi, %edx
+	movl	SAVE_EDI, %edi
+
+	adcl	$0, %ebp
+
+	movl	%eax, 4(%ecx)
+
+	movl	%ebp, 12(%ecx)
+	movl	SAVE_EBP, %ebp
+
+	movl	%edx, 8(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+deflit(`FRAME',0)
+
+	pushl	%esi	defframe_pushl(`SAVE_ESI')
+	cmpl	$4, %edx
+
+	movl	PARAM_SRC, %esi
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%ebp	defframe_pushl(`SAVE_EBP')
+	pushl	%edi	defframe_pushl(`SAVE_EDI')
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%esi), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+	movl	8(%esi), %eax
+
+	pushl	%ebx	defframe_pushl(`SAVE_EBX')
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%esi), %eax
+
+	mull	4(%esi)		C src[0] * src[1]
+
+	movl	%eax, %ebx
+	movl	%edx, %edi
+
+	movl	(%esi), %eax
+
+	mull	8(%esi)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%esi), %eax
+
+	mull	8(%esi)		C src[1] * src[2]
+
+	xorl	%esi, %esi
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	dst[1]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	zero, will be dst[5]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%ebx, %ebx
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %esi
+	addl	%ebx, %eax
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %ebx
+
+	adcl	%ebp, %ebx
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	SAVE_EBP, %ebp
+
+	movl	%ebx, 12(%ecx)
+	movl	SAVE_EBX, %ebx
+
+	adcl	%edx, %edi
+	movl	20(%ecx), %eax
+
+	movl	%edi, 16(%ecx)
+	movl	SAVE_EDI, %edi
+
+	adcl	%esi, %eax	C no carry out of this
+	movl	SAVE_ESI, %esi
+
+	movl	%eax, 20(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP,    -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx
+	C edx	size
+	C esi	src
+	C edi
+	C ebp
+deflit(`FRAME',4)  dnl  %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	subl	$STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+	movl	$1, %ecx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	subl	%edx, %ecx		C -(size-1)
+
+	movl	%ebp, SAVE_EBP
+	movl	$0, %ebx		C initial carry
+
+	leal	(%esi,%edx,4), %esi	C &src[size]
+	movl	%eax, %ebp		C multiplier
+
+	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative, -(size-1) to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size-1]
+	C ebp	multiplier
+
+	movl	%ebp, %eax
+
+	mull	(%esi,%ecx,4)
+
+	addl	%ebx, %eax
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	movl	%eax, 4(%edi,%ecx,4)
+
+	incl	%ecx
+	jnz	L(mul_1)
+
+
+	movl	%ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl  This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl  that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+	C eax
+	C ebx	carry
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size-1]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	movl	%ecx, %edx
+	negl	%ecx
+
+	shll	$4, %ecx
+ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+	negl	%edx
+
+ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
+
+	C The calculated jump mustn't be before the start of the available
+	C code.  This is the limit that UNROLL_COUNT puts on the src operand
+	C size, but checked here using the jump address directly.
+
+	ASSERT(ae,
+	`movl_text_address( L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	high limb to store
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	&src[size], constant
+	C edi	dst ptr, second highest limb of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
+	movl	%edx, VAR_COUNTER
+
+	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand
+
+	mull	%ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+	testb	$1, %cl
+
+	movl	%edx, %ebx	C high carry
+	leal	4(%edi), %edi
+
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	leal	CODE_BYTES_PER_LIMB(%edx), %edx
+
+	cmovX(	%ebx, %ecx)	C high carry reverse
+	cmovX(	%eax, %ebx)	C low carry reverse
+	movl	%edx, VAR_JMP
+	jmp	*%edx
+
+
+	C Must be on an even address here so the low bit of the jump address
+	C will indicate which way around ecx/ebx should start.
+
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	scratch
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src pointer
+	C edi	dst pointer
+	C ebp	multiplier
+	C
+	C 15 code bytes each limb
+	C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ebx, disp_dst,(%edi))
+	adcl	%eax, %ecx
+	movl	%edx, %ebx
+	adcl	$0, %ebx
+',`
+	dnl  this one comes out last
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp_dst,(%edi))
+	adcl	%eax, %ebx
+	movl	%edx, %ecx
+	adcl	$0, %ecx
+')
+')
+L(unroll_inner_end):
+
+	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+	movl	VAR_COUNTER, %edx
+	adcl	$0, %ecx
+
+	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+	movl	VAR_JMP, %ecx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %esi
+	addl	$OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(corner):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[2*size-5]
+	C ebp
+
+	movl	-12(%esi), %eax
+
+	mull	-8(%esi)
+
+	addl	%eax, (%edi)
+	movl	-12(%esi), %eax
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+
+	mull	-4(%esi)
+
+	addl	%eax, %ebx
+	movl	-8(%esi), %eax
+
+	adcl	$0, %edx
+
+	addl	%ebx, 4(%edi)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+
+	mull	-4(%esi)
+
+	movl	PARAM_SIZE, %ecx
+	addl	%ebx, %eax
+
+	adcl	$0, %edx
+
+	movl	%eax, 8(%edi)
+
+	movl	%edx, 12(%edi)
+	movl	PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+	subl	$1, %ecx		C size-1
+	xorl	%eax, %eax		C ready for final adcl, and clear carry
+
+	movl	%ecx, %edx
+	movl	PARAM_SRC, %esi
+
+
+L(lshift):
+	C eax
+	C ebx
+	C ecx	counter, size-1 to 1
+	C edx	size-1 (for later use)
+	C esi	src (for later use)
+	C edi	dst, incrementing
+	C ebp
+
+	rcll	4(%edi)
+	rcll	8(%edi)
+
+	leal	8(%edi), %edi
+	decl	%ecx
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax
+
+	movl	%eax, 4(%edi)		C dst most significant limb
+	movl	(%esi), %eax		C src[0]
+
+	leal	4(%esi,%edx,4), %esi	C &src[size]
+	subl	%edx, %ecx		C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+	mull	%eax
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+
+
+L(diag):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, negative
+	C edx	carry
+	C esi	&src[size]
+	C edi	dst[2*size-2]
+	C ebp
+
+	movl	(%esi,%ecx,4), %eax
+	movl	%edx, %ebx
+
+	mull	%eax
+
+	addl	%ebx, 4(%edi,%ecx,8)
+	adcl	%eax, 8(%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	addl	%edx, 4(%edi)		C dst most significant limb
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+	addl	(%esp), %ecx
+	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+EPILOGUE()
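The comments above describe the classic triangular scheme: each cross product src[i]*src[j] (i&lt;j) is computed once, the whole result is doubled by the rcll left-shift pass, and the diagonal squares src[i]^2 are then added in. A Python sketch over 32-bit limbs (illustrative only, not the GMP C code):

```python
LIMB = 1 << 32

def sqr_basecase(src):
    n = len(src)
    dst = [0] * (2 * n)
    for i in range(n):                  # cross products, each computed once
        carry = 0
        for j in range(i + 1, n):
            t = dst[i + j] + src[i] * src[j] + carry
            dst[i + j], carry = t % LIMB, t // LIMB
        dst[i + n] = carry
    carry = 0                           # double (the asm's rcll pass)
    for k in range(2 * n):
        t = 2 * dst[k] + carry
        dst[k], carry = t % LIMB, t // LIMB
    carry = 0                           # add the diagonal squares
    for i in range(n):
        s = src[i] * src[i]
        t = dst[2 * i] + (s % LIMB) + carry
        dst[2 * i] = t % LIMB
        t = dst[2 * i + 1] + (s // LIMB) + (t // LIMB)
        dst[2 * i + 1], carry = t % LIMB, t // LIMB
    return dst
```

Computing each cross product once and doubling afterwards is what gives squaring roughly half the multiply count of a general basecase multiply.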
diff --git a/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm
new file mode 100644
index 0000000..144b627
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl  Intel P6/SSE2 mpn_addmul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Write P6 specific SSE2 code.
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86/pentium4/sse2/addmul_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a1e261b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h
@@ -0,0 +1,200 @@
+/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                77
+#define MUL_TOOM44_THRESHOLD               169
+#define MUL_TOOM6H_THRESHOLD               246
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             690  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 5}, \
+    {    383, 4}, {    991, 5}, {    511, 6}, {    267, 7}, \
+    {    157, 8}, {     91, 9}, {     47, 8}, {    111, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 4}, {   1023, 8}, {     67, 9}, \
+    {     39, 5}, {    639, 4}, {   1471, 6}, {    383, 7}, \
+    {    209, 8}, {    119, 9}, {     63, 7}, {    255, 8}, \
+    {    139, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    167,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671, 9}, {   1343,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  31
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 100
+#define SQRLO_SQR_THRESHOLD               9236
+
+#define DC_DIV_QR_THRESHOLD                 25
+#define DC_DIVAPPR_Q_THRESHOLD              55
+#define DC_BDIV_QR_THRESHOLD                60
+#define DC_BDIV_Q_THRESHOLD                132
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD                65
+#define INV_APPR_THRESHOLD                  65
+
+#define BINV_NEWTON_THRESHOLD              252
+#define REDC_1_TO_REDC_N_THRESHOLD          62
+
+#define MU_DIV_QR_THRESHOLD               1164
+#define MU_DIVAPPR_Q_THRESHOLD             748
+#define MUPI_DIV_QR_THRESHOLD               38
+#define MU_BDIV_QR_THRESHOLD              1360
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  2,23,258,879,2246
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               582
+#define SET_STR_PRECOMPUTE_THRESHOLD      1118
+
+#define FAC_DSC_THRESHOLD                  178
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD_THRESHOLD                      69
+#define HGCD_APPR_THRESHOLD                112
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                303
+#define JACOBI_BASE_METHOD                   1
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm
new file mode 100644
index 0000000..8b7b7ad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel P6/SSE2 mpn_mod_1_1.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm
new file mode 100644
index 0000000..49c96c6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl  Intel P6/SSE2 mpn_mod_1_4.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm
new file mode 100644
index 0000000..50e5b69
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm
@@ -0,0 +1,38 @@
+dnl  Intel P6/SSE2 mpn_mul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Write P6 specific SSE2 code.  It should reach 3 c/l.
+C    The Pentium4 code runs at 4.2 c/l.
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`x86/pentium4/sse2/mul_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm
new file mode 100644
index 0000000..4687625
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_mul_basecase.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86/pentium4/sse2/mul_basecase.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/popcount.asm b/third_party/gmp/mpn/x86/p6/sse2/popcount.asm
new file mode 100644
index 0000000..4c02b93
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_popcount -- population count.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..76b574b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_sqr_basecase.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86/pentium4/sse2/sqr_basecase.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm
new file mode 100644
index 0000000..69d940d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_submul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`x86/pentium4/sse2/submul_1.asm')
diff --git a/third_party/gmp/mpn/x86/pentium/README b/third_party/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000..305936b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/README
@@ -0,0 +1,181 @@
+Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                   INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors.  The mmx subdirectory has additional code for Pentium with MMX
+(P55).
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n            2.375
+
+	mpn_mul_1                 12.0
+	mpn_add/submul_1          14.0
+
+	mpn_mul_basecase          14.2 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase           8 cycles/crossproduct (approx)
+                                   or 15.5 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               5.375 normal (6.0 on P54)
+				   1.875 special shift by 1 bit
+
+	mpn_divrem_1              44.0
+	mpn_mod_1                 28.0
+	mpn_divexact_by3          15.0
+
+	mpn_copyi/copyd            1.0
+
+Pentium MMX gets the following improvements
+
+	mpn_l/rshift               1.75
+
+	mpn_mul_1                 12.0 normal, 7.0 for 16-bit multiplier
+
+
+mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb.  Due to loop
+overhead and other delays (cache refill?), they in practice run at the 2.375
+cycles/limb shown above.
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should.  Intel documentation says a mul instruction is 10 cycles, but it
+measures 9, and the routines using it run accordingly.
+
+
+
+P55 MMX AND X87
+
+The cost of switching between MMX and x87 floating point on P55 is about 100
+cycles (fld1/por/emms for instance).  To avoid that cost the two aren't
+mixed, and currently that means using MMX and not x87.
+
+MMX offers a big speedup for lshift and rshift, and a nice speedup for
+16-bit multipliers in mpn_mul_1.  If fast code using x87 is found then
+perhaps the preference for MMX will be reversed.
+
+
+
+
+P54 SHLDL
+
+mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically.  The P55 runs them at the expected speed.
+
+It seems that on P54 a shldl or shrdl allows pairing in one following cycle,
+but not two.  For example, back to back repetitions of the following
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+
+run at 5 cycles, as expected, but repetitions of the following run at 7
+cycles, whereas 6 would be expected (and is achieved on P55),
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+	xorl	%edi, %edi
+	xorl	%ebp, %ebp
+
+Three xorls run at 7 cycles too, so it doesn't seem to be just that pairing
+is inhibited only in the second following cycle (or something like that).
+
+Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a
+pattern of shift, 2 loads, shift, 2 stores, shift, etc.  A start has been
+made on something like that, but it's not yet complete.
+
+
+
+
+OTHER NOTES
+
+Prefetching Destinations
+
+    Pentium doesn't allocate cache lines on writes, unlike most other modern
+    processors.  Since the functions in the mpn class do array writes, we
+    have to handle allocating the destination cache lines by reading a word
+    from the destination in the loops, to achieve the best performance.
+
+Prefetching Sources
+
+    Prefetching of sources is pointless since there are no out-of-order loads.
+    Any load instruction blocks until the line is brought to L1, so it may
+    as well be the load that wants the data which blocks.
+
+Data Cache Bank Clashes
+
+    Pairing of memory operations requires that the two issued operations
+    refer to different cache banks (ie. different addresses modulo 32
+    bytes).  The simplest way to ensure this is to read/write two words from
+    the same object.  If we make operations on different objects, they might
+    or might not be to the same cache bank.
+
+PIC %eip Fetching
+
+    A simple call $+5 and popl can be used to get %eip, there's no need to
+    balance calls and returns since P5 doesn't have any return stack branch
+    prediction.
+
+Float Multiplies
+
+    fmul is pairable and can be issued every 2 cycles (with a 4 cycle
+    latency for data ready to use).  This is a lot better than integer mull
+    or imull at 9 cycles non-pairing.  Unfortunately the advantage is
+    quickly eaten away by needing to throw data through memory back to the
+    integer registers to adjust for fild and fist being signed, and to do
+    things like propagating carry bits.
+
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816.  This
+is mostly about P5, the parts about P6 aren't relevant.  Available on-line:
+
+        http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/pentium/aors_n.asm b/third_party/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000..01ebfb9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,203 @@
+dnl  Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 2.375 cycles/limb
+
+
+ifdef(`OPERATION_add_n',`
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(endgo)
+
+	pushl	%edx
+FRAME_pushl()
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(oop)
+
+L(endgo):
+deflit(`FRAME',16)
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(end)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_n)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(end)
+	pushl	%edx
+FRAME_pushl()
+
+	ALIGN(8)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	8(%ebp),%ebx
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	12(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	16(%ebp),%ebx
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	20(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	24(%ebp),%ebx
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	28(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	32(%ebp),%ebx
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebp),%ebp
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx
+FRAME_popl()
+L(end):
+	decl	%edx			C test %edx w/o clobbering carry
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebp),%ebp
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm b/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000..d83cc45
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,144 @@
+dnl  Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+ifdef(`OPERATION_addmul_1', `
+      define(M4_inst,        addl)
+      define(M4_function_1,  mpn_addmul_1)
+      define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1', `
+      define(M4_inst,        subl)
+      define(M4_function_1,  mpn_submul_1)
+      define(M4_function_1c, mpn_submul_1c)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size-1
+
+	leal	(%esi,%eax,4), %esi
+	xorl	$-1, %ebx		C -size, and clear carry
+
+	leal	(%edi,%eax,4), %edi
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,4), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	(%edi,%ebx,4), %ecx
+
+	adcl	$0, %edx
+	M4_inst	%eax, %ecx
+
+	movl	%ecx, (%edi,%ebx,4)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebx
+
+	movl	%ecx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm b/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm
new file mode 100644
index 0000000..c2c4f58
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm
@@ -0,0 +1,266 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	$-1, %ecx
+	movl	PARAM_DIVISOR, %eax
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax
+',`
+	call	L(here)
+L(here):
+	popl	%ebp			C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	C AGI
+	movl	binvert_limb_table@GOT(%ebp), %ebp
+	C AGI
+	movzbl	(%eax,%ebp), %eax
+')
+',`
+
+dnl non-PIC
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SHIFT, %ecx
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	movl	PARAM_INVERSE, %eax
+
+L(common):
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+	movl	%eax, VAR_INVERSE
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	xorl	%ebp, %ebp		C initial carry bit
+
+	orl	%ecx, %ecx		C shift
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	jz	L(odd_entry)
+
+	xorl	%edx, %edx		C initial carry limb (for even, if one)
+	incl	%ebx
+	jz	L(one)
+
+	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
+	shrdl(	%cl, %edx, %eax)
+
+	jmp	L(even_entry)
+
+
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+L(one):
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/com.asm b/third_party/gmp/mpn/x86/pentium/com.asm
new file mode 100644
index 0000000..b080545
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/com.asm
@@ -0,0 +1,181 @@
+dnl  Intel Pentium mpn_com -- mpn ones complement.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb
+
+
+NAILS_SUPPORT(0-31)
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This code is similar to mpn_copyi, basically there's just some "xorl
+C $GMP_NUMB_MASK"s inserted.
+C
+C Alternatives:
+C
+C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
+C are the same alignment mod 8, but it doesn't seem worth the trouble for
+C just that case (there'd need to be some plain integer available too for
+C the unaligned case).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	(%eax,%ecx,4), %eax
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_DST, %edx
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	(%edx), %esi		C fetch destination cache line
+	nop
+
+L(top):
+	C eax	&src[size]
+	C ebx
+	C ecx	counter, limbs, negative
+	C edx	dst, incrementing
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28(%edx), %esi		C destination prefetch
+	addl	$32, %edx
+
+	movl	-28(%eax,%ecx,4), %esi
+	movl	-24(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -32(%edx)
+	movl	%edi, -28(%edx)
+
+	movl	-20(%eax,%ecx,4), %esi
+	movl	-16(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -24(%edx)
+	movl	%edi, -20(%edx)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -16(%edx)
+	movl	%edi, -12(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -8(%edx)
+	movl	%edi, -4(%edx)
+
+	addl	$8, %ecx
+	js	L(top)
+
+
+L(end):
+	C eax	&src[size]
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	nop
+
+	jns	L(no4)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, 8(%edx)
+	movl	%edi, 12(%edx)
+
+	addl	$16, %edx
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	nop
+
+	jns	L(no2)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	addl	$8, %edx
+	addl	$2, %ecx
+L(no2):
+
+	popl	%edi
+	jnz	L(done)
+
+	movl	-4(%eax), %ecx
+
+	xorl	$GMP_NUMB_MASK, %ecx
+	popl	%esi
+
+	movl	%ecx, (%edx)
+	ret
+
+L(done):
+	popl	%esi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/copyd.asm b/third_party/gmp/mpn/x86/pentium/copyd.asm
new file mode 100644
index 0000000..72a543b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/copyd.asm
@@ -0,0 +1,146 @@
+dnl  Intel Pentium mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C See comments in copyi.asm.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	-4(%eax,%ecx,4), %eax		C &src[size-1]
+	movl	PARAM_DST, %edx
+
+	subl	$7, %ecx			C size-7
+	jle	L(end)
+
+	movl	28-4(%edx,%ecx,4), %esi		C prefetch cache, dst[size-1]
+	nop
+
+L(top):
+	C eax	src, decrementing
+	C ebx
+	C ecx	counter, limbs
+	C edx	dst
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28-32(%edx,%ecx,4), %esi	C prefetch dst cache line
+	subl	$8, %ecx
+
+	movl	(%eax), %esi			C read words pairwise
+	movl	-4(%eax), %edi
+	movl	%esi, 56(%edx,%ecx,4)		C store words pairwise
+	movl	%edi, 52(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, 48(%edx,%ecx,4)
+	movl	%edi, 44(%edx,%ecx,4)
+
+	movl	-16(%eax), %esi
+	movl	-20(%eax), %edi
+	movl	%esi, 40(%edx,%ecx,4)
+	movl	%edi, 36(%edx,%ecx,4)
+
+	movl	-24(%eax), %esi
+	movl	-28(%eax), %edi
+	movl	%esi, 32(%edx,%ecx,4)
+	movl	%edi, 28(%edx,%ecx,4)
+
+	leal	-32(%eax), %eax
+	jg	L(top)
+
+
+L(end):
+	C ecx	-7 to 0, representing respectively 0 to 7 limbs remaining
+	C eax	src end
+	C edx	dst, next location to store
+
+	addl	$4, %ecx
+	jle	L(no4)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, 8(%edx,%ecx,4)
+	movl	%edi, 4(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$16, %eax
+	subl	$4, %ecx
+L(no4):
+
+	addl	$2, %ecx
+	jle	L(no2)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$8, %eax
+	subl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)	C risk of cache bank clash here
+
+L(done):
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/copyi.asm b/third_party/gmp/mpn/x86/pentium/copyi.asm
new file mode 100644
index 0000000..d983d6b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,164 @@
+dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointer needs to be incremented rather than
+C using indexing, so that there's somewhere to put the loop control without
+C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
+C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
+C subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_DST, %edx
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	leal	(%edx,%ecx,4), %edx	C &dst[size-1]
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_SRC, %esi
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
+	nop
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs, negative
+	C edx	&dst[size-1]
+	C esi	src, incrementing
+	C edi
+	C ebp
+
+	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
+	addl	$8, %ecx
+
+	movl	(%esi), %eax		C read words pairwise
+	movl	4(%esi), %ebx
+	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
+	movl	%ebx, -56(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -52(%edx,%ecx,4)
+	movl	%ebx, -48(%edx,%ecx,4)
+
+	movl	16(%esi), %eax
+	movl	20(%esi), %ebx
+	movl	%eax, -44(%edx,%ecx,4)
+	movl	%ebx, -40(%edx,%ecx,4)
+
+	movl	24(%esi), %eax
+	movl	28(%esi), %ebx
+	movl	%eax, -36(%edx,%ecx,4)
+	movl	%ebx, -32(%edx,%ecx,4)
+
+	leal	32(%esi), %esi
+	js	L(top)
+
+
+L(end):
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C esi	src end
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	jns	L(no4)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -12(%edx,%ecx,4)
+	movl	%ebx, -8(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$16, %esi
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	jns	L(no2)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$8, %esi
+	addl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%esi), %eax
+	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here
+
+L(done):
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/dive_1.asm b/third_party/gmp/mpn/x86/pentium/dive_1.asm
new file mode 100644
index 0000000..21b5287
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/dive_1.asm
@@ -0,0 +1,264 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C setup.  Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
+C size==3 for even are about the same speed for both divl or mul, but the
+C former is used since it will use up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	andl	$1, %eax
+
+	movl	PARAM_DST, %edi
+	addl	%ecx, %eax	C size if even, size+1 if odd
+
+	cmpl	$4, %eax
+	jae	L(mul_by_inverse)
+
+
+	xorl	%edx, %edx
+L(div_top):
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	%eax, -4(%edi,%ecx,4)
+	decl	%ecx
+
+	jnz	L(div_top)
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+
+L(mul_by_inverse):
+	movl	PARAM_DIVISOR, %eax
+	movl	$-1, %ecx
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`dnl
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	xorl	%ebp, %ebp		C initial carry bit
+
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	orl	%ecx, %ecx		C shift
+
+	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
+	jz	L(odd_entry)
+
+	shrdl(	%cl, %edx, %eax)
+
+	incl	%ebx
+	jmp	L(even_entry)
+
+
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000..befa6e2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 166MHz P54 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             16
+#define MUL_TOOM33_THRESHOLD             90
+
+#define SQR_BASECASE_THRESHOLD            0  /* always */
+#define SQR_TOOM2_THRESHOLD              22
+#define SQR_TOOM3_THRESHOLD             122
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 52
+#define POWM_THRESHOLD                   77
+
+#define HGCD_THRESHOLD                  121
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                615
+#define JACOBI_BASE_METHOD                2
+
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  1  /* native */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             23
+#define GET_STR_PRECOMPUTE_THRESHOLD     33
+#define SET_STR_THRESHOLD              2788
+
+#define MUL_FFT_TABLE  { 432, 928, 1664, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          448
+#define MUL_FFT_THRESHOLD              3328
+
+#define SQR_FFT_TABLE  { 496, 928, 1920, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          512
+#define SQR_FFT_THRESHOLD              3328
diff --git a/third_party/gmp/mpn/x86/pentium/hamdist.asm b/third_party/gmp/mpn/x86/pentium/hamdist.asm
new file mode 100644
index 0000000..6c6c1a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/hamdist.asm
@@ -0,0 +1,154 @@
+dnl  Intel P5 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
+C
+C It might be possible to shave 1 cycle from the loop, and hence 2
+C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
+C would be 1, if the right schedule could be found (not found so far).
+C Wanting to avoid potential cache bank clashes makes it tricky.
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: referencing popcount.asm's table is incorrect as it hurts incremental
+C linking.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC1, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_hamdist)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+	shll	%ecx		C size in byte pairs
+	pushl	%edi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+	LEA(	TABLE_NAME, %ebp)
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+	xorl	%eax, %eax	C total
+',`
+	call	L(here)	FRAME_pushl()
+L(here):
+	movl	PARAM_SRC1, %esi
+	popl	%ebp	FRAME_popl()
+
+	movl	PARAM_SRC2, %edi
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%eax, %eax	C total
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+
+	xorl	%eax, %eax	C total
+	pushl	%ebx	FRAME_pushl()
+
+	xorl	%edx, %edx	C byte
+	xorl	%ebx, %ebx	C byte
+
+define(TABLE,`TABLE_NAME($1)')
+')
+
+
+	C The nop after the xorb seems necessary.  Although a movb might be
+	C expected to go down the V pipe in the second cycle of the xorb, it
+	C doesn't and costs an extra 2 cycles.
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src1
+	C edi	src2
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-1(%edi,%ecx,2), %dl
+
+	xorb	%dl, %bl
+	movb	-2(%esi,%ecx,2), %dl
+
+	xorb	-2(%edi,%ecx,2), %dl
+	nop
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/logops_n.asm b/third_party/gmp/mpn/x86/pentium/logops_n.asm
new file mode 100644
index 0000000..1877317
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/logops_n.asm
@@ -0,0 +1,176 @@
+dnl  Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 3.0 c/l  and, ior, xor
+C     3.5 c/l  andn, iorn, nand, nior, xnor
+
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4op',        `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n,     , andl,    )
+M4_choose_op( andn_n,    , andl, yes)
+M4_choose_op( nand_n, yes, andl,    )
+M4_choose_op( ior_n,     ,  orl,    )
+M4_choose_op( iorn_n,    ,  orl, yes)
+M4_choose_op( nior_n, yes,  orl,    )
+M4_choose_op( xor_n,     , xorl,    )
+M4_choose_op( xnor_n, yes, xorl,    )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+NAILS_SUPPORT(0-31)
+
+
+C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C Nothing complicated here, just some care to avoid data cache bank clashes
+C and AGIs.
+C
+C We're one register short of being able to do a simple 4 loads, 2 ops, 2
+C stores.  Instead %ebp is juggled a bit and nops are introduced to keep the
+C pairings as intended.  An in-place operation would free up a register, for
+C a 0.5 c/l speedup, if that's worth bothering with.
+C
+C This code seems best for P55 too.  Data alignment is a big problem for MMX
+C and the pairing restrictions on movq and integer instructions make life
+C difficult.
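The eight operations the M4 template selects between are simple limb-wise loops; the "pre" variants complement the y operand first and the "post" variants complement the result. A plain-C sketch of two representatives (names hypothetical, assuming 32-bit limbs):

```c
#include <stddef.h>
#include <stdint.h>

typedef uint32_t mp_limb;

/* xnor_n: a "post" variant -- complement the result of xor. */
static void ref_xnor_n(mp_limb *wp, const mp_limb *xp, const mp_limb *yp,
                       size_t size)
{
    for (size_t i = 0; i < size; i++)
        wp[i] = ~(xp[i] ^ yp[i]);
}

/* andn_n: a "pre" variant -- complement y before the and. */
static void ref_andn_n(mp_limb *wp, const mp_limb *xp, const mp_limb *yp,
                       size_t size)
{
    for (size_t i = 0; i < size; i++)
        wp[i] = xp[i] & ~yp[i];
}
```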
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_YP,  12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_XP, %ebx
+
+	movl	PARAM_YP, %esi
+	movl	PARAM_WP, %edi
+
+	shrl	%ecx
+	jnc	L(entry)
+
+	movl	(%ebx,%ecx,8), %eax	C risk of data cache bank clash here
+	movl	(%esi,%ecx,8), %edx
+
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(%edx)')
+
+	M4op	%edx, %eax
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+	orl	%ecx, %ecx
+
+	movl	%eax, (%edi,%ecx,8)
+	jz	L(done)
+
+	jmp	L(entry)
+
+
+L(top):
+	C eax
+	C ebx	xp
+	C ecx	counter, limb pairs, decrementing
+	C edx
+	C esi	yp
+	C edi	wp
+	C ebp
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+L(entry):
+	movl	-4(%ebx,%ecx,8), %ebp
+	nop
+
+	movl	-4(%esi,%ecx,8), %eax
+	movl	-8(%esi,%ecx,8), %edx
+
+M4pre(`	xorl	$GMP_NUMB_MASK, %eax')
+M4pre(`	xorl	$GMP_NUMB_MASK, %edx')
+
+	M4op	%ebp, %eax
+	movl	-8(%ebx,%ecx,8), %ebp
+
+	decl	%ecx
+	jnz	L(top)
+
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+
+L(done):
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/lshift.asm b/third_party/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000..2a31f36
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
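Functionally, the routine computes what this plain-C sketch does (hypothetical reference form, assuming 32-bit limbs and 1 <= shift <= 31): shift left working from the most significant limb down, which is what makes the general path safe when dst overlaps src from above, and return the bits shifted out of the top limb.

```c
#include <stddef.h>
#include <stdint.h>

static uint32_t ref_lshift(uint32_t *dst, const uint32_t *src,
                           size_t size, unsigned shift)
{
    /* Bits shifted out of the most significant limb: the return value. */
    uint32_t retval = src[size - 1] >> (32 - shift);

    /* High to low: each dst limb combines two adjacent src limbs,
       mirroring the shldl instructions in the loop below. */
    for (size_t i = size - 1; i > 0; i--)
        dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));

    dst[0] = src[0] << shift;   /* zeros shifted in at the right */
    return retval;
}
```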
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(%esi,%ebp,4),%eax
+	cmpl	%eax,%edi
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	shldl(	%cl, %ebx, %edx)
+	shldl(	%cl, %eax, %ebx)
+	movl	%edx,-8(%edi)
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	shldl(	%cl, %edx, %eax)
+	shldl(	%cl, %ebx, %edx)
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shldl(	%cl,%eax,%edx)
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shll	%cl,%edx		C compute least significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from the least significant end of the arrays, which is only
+C permissible when the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
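This special case exploits the fact that a shift-by-1 is just a self-addition with carry propagation, which is what the `adcl %eax,%eax` chain below does. A plain-C sketch (hypothetical names; valid only when dst does not overlap src in the direction the loop moves):

```c
#include <stddef.h>
#include <stdint.h>

static uint32_t ref_lshift1(uint32_t *dst, const uint32_t *src, size_t size)
{
    uint32_t carry = 0;
    for (size_t i = 0; i < size; i++) {
        uint32_t limb = src[i];
        dst[i] = (limb << 1) | carry;   /* adc limb,limb in the asm */
        carry  = limb >> 31;            /* bit carried into the next limb */
    }
    return carry;   /* bit shifted out of the top limb */
}
```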
+
+L(special):
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	addl	%edx,%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	adcl	%ebx,%ebx
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebx,%ebx
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		C use leal not to clobber carry
+	leal	32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebx,(%edi)
+
+	leal	4(%esi),%esi		C use leal not to clobber carry
+	leal	4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000..02a0def
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,163 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 233MHz P55 */
+
+#define MOD_1_NORM_THRESHOLD                 5
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        11
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     63
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           51
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                53
+#define MUL_TOOM44_THRESHOLD               128
+#define MUL_TOOM6H_THRESHOLD               189
+#define MUL_TOOM8H_THRESHOLD               260
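These tuned constants gate algorithm selection by operand size in limbs. Illustrative sketch only, with hypothetical stand-in functions rather than GMP's internal API: below the crossover the schoolbook basecase wins, above it the Toom/Karatsuba routine does.

```c
#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 16   /* value from the table above */

static int base_calls, toom_calls;

static void mul_basecase(void) { base_calls++; }  /* schoolbook multiply */
static void mul_toom22(void)   { toom_calls++; }  /* Karatsuba-style split */

static void mul_dispatch(size_t size)
{
    if (size < MUL_TOOM22_THRESHOLD)
        mul_basecase();   /* below the tuned crossover */
    else
        mul_toom22();
}
```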
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      90
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                210
+#define SQR_TOOM8_THRESHOLD                375
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define MUL_FFT_MODF_THRESHOLD             364  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    364, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    167,10}, \
+    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    287,10}, {    575,11}, \
+    {    351,12}, {    191,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    575,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    831,12}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 90
+#define MUL_FFT_THRESHOLD                 3520
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     29, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     65, 8}, {     43, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    351,11}, {    191,10}, {    415,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    351,12}, {    191,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    575,12}, \
+    {    319,11}, {    703,12}, {    383,11}, {    767,12}, \
+    {    447,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 96
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  48
+#define MULLO_MUL_N_THRESHOLD             6633
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             170
+#define DC_BDIV_QR_THRESHOLD                43
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 171
+
+#define BINV_NEWTON_THRESHOLD              194
+#define REDC_1_TO_REDC_N_THRESHOLD          50
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               90
+#define MU_BDIV_QR_THRESHOLD               942
+#define MU_BDIV_Q_THRESHOLD               1017
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      92
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                221
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        31
+#define SET_STR_DC_THRESHOLD               490
+#define SET_STR_PRECOMPUTE_THRESHOLD       994
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm b/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm
new file mode 100644
index 0000000..72e3196
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm
@@ -0,0 +1,40 @@
+dnl  Intel P55 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P55: hamdist 12.0 cycles/limb
+
+C For reference, this code runs at 11.5 cycles/limb for popcount, which is
+C slower than the plain integer mpn/x86/pentium/popcount.asm.
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm b/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000..04b0ddc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,463 @@
+dnl  Intel P5 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right.  Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
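The MMX version's core trick is treating each aligned limb pair as one 64-bit quantity: the pair is shifted left as a whole (`psllq`) while the pair below supplies the incoming bits via a right shift by 64-shift (`psrlq`), the halves merged with `por`. A hypothetical standalone C form of that combining step, assuming 1 <= shift <= 63:

```c
#include <stdint.h>

/* One output qword: hi_pair shifted left, filled from lo_pair's top bits. */
static uint64_t shift_pair(uint64_t hi_pair, uint64_t lo_pair, unsigned shift)
{
    return (hi_pair << shift) | (lo_pair >> (64 - shift));
}
```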
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	movl	-4(%ebx,%eax,4), %edi	C src high limb
+	decl	%eax
+
+	jnz	L(simple)
+
+	shldl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shll	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx,%eax,4), %mm5	C src high limb
+
+	movd	%ecx, %mm6		C lshift
+	negl	%ecx
+
+	psllq	%mm6, %mm5
+	addl	$32, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	$32, %mm5		C retval
+
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	movd	%mm0, 4(%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+
+	movd	%mm5, %eax
+	psllq	%mm6, %mm0
+
+	popl	%edi
+	popl	%ebx
+
+	movd	%mm0, (%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	-4(%ebx,%eax,4), %mm5	C src high limb
+	leal	(%ebx,%eax,4), %edi
+
+	movd	%ecx, %mm6		C lshift
+	andl	$4, %edi
+
+	psllq	%mm6, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process high limb separately (marked xxx) to
+	C make it so.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+-------+--
+	C  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+--
+	C  |  xxx  |       |
+	C  +-------+-------+--
+
+	movq	-8(%ebx,%eax,4), %mm0	C unaligned load
+
+	psllq	%mm6, %mm0
+	decl	%eax
+
+	psrlq	$32, %mm0
+
+	C
+
+	movd	%mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+	movq	-8(%ebx,%eax,4), %mm1	C src high qword
+	leal	(%edx,%eax,4), %edi
+
+	andl	$4, %edi
+	psrlq	$32, %mm5		C return value
+
+	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+	C is 32 bits extra.  High limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+--
+	C  |      mm1      |
+	C  +-------+-------+--
+	C                0mod8   4mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+-------+--
+	C  |  xxx  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psllq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+	psrlq	$32, %mm0
+
+	C wasted cycle here waiting for %mm0
+
+	movd	%mm0, -4(%edx,%eax,4)
+	subl	$4, %edx
+L(start_dst_aligned):
+
+
+	psllq	%mm6, %mm1
+	negl	%ecx			C -shift
+
+	addl	$64, %ecx		C 64-shift
+	movq	%mm3, %mm2
+
+	movd	%ecx, %mm7
+	subl	$8, %eax		C size-8
+
+	psrlq	%mm7, %mm3
+
+	por	%mm1, %mm3		C mm3 ready to store
+	jc	L(finish)
+
+
+	C The comments in mpn_rshift apply here too.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from 16(%ebx,%eax,4)
+	C mm3	dst qword ready to store to 24(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	lshift
+	C mm7	rshift
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	(%ebx,%eax,4), %mm3	C
+	psllq	%mm6, %mm1		C
+
+	movq	%mm0, 16(%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psrlq	%mm7, %mm3		C
+	subl	$4, %eax
+
+	por	%mm1, %mm3		C
+	jnc	L(unroll_loop)
+
+
+
+L(finish):
+	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
+
+	testb	$2, %al
+
+	jz	L(finish_no_two)
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	subl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
+	C
+	C mm2	src prev qword, from 16(%ebx,%eax,4)
+	C mm3	dst qword, for 24(%edx,%eax,4)
+
+	testb	$1, %al
+	movd	%mm5, %eax	C retval
+
+	popl	%edi
+	jz	L(finish_zero)
+
+
+	C One extra src limb, destination was aligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C dest         edx+12           edx+4     edx
+	C --+---------------+---------------+-------+
+	C   |      mm3      |               |       |
+	C --+---------------+---------------+-------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra src limb, destination was unaligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C         dest         edx+12           edx+4
+	C         --+---------------+---------------+
+	C           |      mm3      |               |
+	C         --+---------------+---------------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 4(%edx), and in the aligned case
+	C there's an extra limb of dst to be formed from that extra src limb
+	C left shifted.
+
+
+	movd	(%ebx), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm3, 12(%edx)
+	psllq	$32, %mm0
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm2, %mm0
+	psllq	%mm6, %mm1
+
+	movq	%mm0, 4(%edx)
+	psrlq	$32, %mm1
+
+	andl	$32, %ecx
+	popl	%ebx
+
+	jz	L(finish_one_unaligned)
+
+	movd	%mm1, (%edx)
+L(finish_one_unaligned):
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra src limbs, destination was aligned.
+	C
+	C                 source          ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C dest          edx+8             edx
+	C --+---------------+---------------+
+	C   |      mm3      |               |
+	C --+---------------+---------------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra src limbs, destination was unaligned.
+	C
+	C               source            ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C         dest          edx+8   edx+4
+	C         --+---------------+-------+
+	C           |      mm3      |       |
+	C         --+---------------+-------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C The movd for the unaligned case writes the same data to 4(%edx)
+	C that the movq does for the aligned case.
+
+
+	movq	%mm3, 8(%edx)
+	andl	$32, %ecx
+
+	psllq	%mm6, %mm2
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, (%edx)
+L(finish_zero_unaligned):
+
+	psrlq	$32, %mm2
+	popl	%ebx
+
+	movd	%mm5, %eax	C retval
+
+	movd	%mm2, 4(%edx)
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm b/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    cycles/limb
+C P5:   12.0   for 32-bit multiplier
+C        7.0   for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits, some special-case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best.  An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy.  Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece.  But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	cmpl	$1, %ecx
+	jne	L(two_or_more)
+
+	C one limb only
+
+	movl	PARAM_MULTIPLIER, %eax
+	movl	PARAM_DST, %ecx
+
+	mull	(%edx)
+
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	%edx, %esi		C src
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULTIPLIER, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	leal	(%esi,%ecx,4), %esi	C src end
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	negl	%ecx			C -size
+
+	pushl	%ebp		FRAME_pushl()
+	cmpl	$65536, %eax
+
+	jb	L(small)
+
+
+L(big):
+	xorl	%ebx, %ebx		C carry limb
+	sarl	%ecx			C -size/2
+
+	jnc	L(top)			C with carry flag clear
+
+
+	C size was odd, process one limb separately
+
+	mull	4(%esi,%ecx,8)		C m * src[0]
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	orl	%edx, %ebx		C carry limb, and clear carry flag
+
+
+L(top):
+	C eax
+	C ebx	carry
+	C ecx	counter, negative
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	(scratch carry)
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ebx, (%edi,%ecx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(top)
+
+
+	adcl	$0, %ebx
+	popl	%ebp
+
+	movl	%ebx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(small):
+	C Special case for 16-bit multiplier.
+	C
+	C eax	multiplier
+	C ebx
+	C ecx	-size
+	C edx	src
+	C esi	src end
+	C edi	dst end
+	C ebp	multiplier
+
+	C size<3 not supported here.  At size==3 we're already a couple of
+	C cycles faster, so there's no threshold as such, just use the MMX
+	C as soon as possible.
+
+	cmpl	$-3, %ecx
+	ja	L(big)
+
+	movd	%eax, %mm7		C m
+	pxor	%mm6, %mm6		C initial carry word
+
+	punpcklwd %mm7, %mm7		C m replicated 2 times
+	addl	$2, %ecx		C -size+2
+
+	punpckldq %mm7, %mm7		C m replicated 4 times
+	andl	$4, %edx		C test alignment, clear carry flag
+
+	movq	%mm7, %mm0		C m
+	jz	L(small_entry)
+
+
+	C Source is unaligned, process one limb separately.
+	C
+	C Plain integer code is used here, since it's smaller and is about
+	C the same 13 cycles as an mmx block would be.
+	C
+	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+	C the use of separate incl and orl.
+
+	mull	-8(%esi,%ecx,4)		C m * src[0]
+
+	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
+	incl	%ecx			C one limb processed
+
+	movd	%edx, %mm6		C initial carry
+
+	orl	%eax, %eax		C clear carry flag
+	jmp	L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions.  In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code.  Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
+L(small_top):
+	C eax	l.low, then l.high
+	C ebx	(h.low)
+	C ecx	counter, -size+2 to 0 or 1
+	C edx	(h.high)
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+	C
+	C %mm0	(high products)
+	C %mm1	(low products)
+	C %mm2	(adjust for m using x signs)
+	C %mm3	(adjust for x using m signs)
+	C %mm4
+	C %mm5
+	C %mm6	h.low, then carry
+	C %mm7	m replicated 4 times
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	movd	%mm0, %edx		C h.high
+	movq	%mm0, %mm6		C new c
+
+	adcl	%eax, %ebx
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.high
+	movq	%mm7, %mm0
+
+	adcl	%eax, %edx
+	movl	%ebx, -16(%edi,%ecx,4)
+
+	movl	%edx, -12(%edi,%ecx,4)
+	psrlq	$32, %mm6		C c
+
+L(small_entry):
+	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
+	movq	%mm7, %mm1
+
+	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
+	movq	%mm7, %mm3
+
+	movq	-8(%esi,%ecx,4), %mm2	C x
+	psraw	$15, %mm3		C m signs
+
+	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
+	psraw	$15, %mm2		C x signs
+
+	paddw	%mm3, %mm0		C add x to h if m neg
+	pand	%mm7, %mm2		C m selected by x signs
+
+	paddw	%mm2, %mm0		C add m to h if x neg
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.low
+	punpcklwd %mm0, %mm6		C c + h.low << 16
+
+	psrlq	$16, %mm0		C h.high
+	js	L(small_top)
+
+
+
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	adcl	%eax, %ebx
+	popl	%ebp		FRAME_popl()
+
+	movd	%mm0, %edx		C h.high
+	psrlq	$32, %mm0		C l.high
+
+	movd	%mm1, %eax		C l.high
+
+	adcl	%eax, %edx
+	movl	%ebx, -12(%edi,%ecx,4)
+
+	movd	%mm0, %eax		C c
+
+	adcl	$0, %eax
+	movl	%edx, -8(%edi,%ecx,4)
+
+	orl	%ecx, %ecx
+	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
+
+
+	C Size odd, one extra limb to process.
+	C Plain integer code is used here, since it's smaller and is about
+	C the same speed as another mmx block would be.
+
+	movl	%eax, %ecx
+	movl	PARAM_MULTIPLIER, %eax
+
+	mull	-4(%esi)
+
+	addl	%ecx, %eax
+
+	adcl	$0, %edx
+	movl	%eax, -4(%edi)
+
+	movl	%edx, %eax
+L(small_done):
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm b/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000..e3b274b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,468 @@
+dnl  Intel P5 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned.  Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	decl	%eax
+	movl	(%ebx), %edi		C src low limb
+
+	jnz	L(simple)
+
+	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shrl	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
+
+	movd	%ecx, %mm6		C rshift
+	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
+
+	psllq	$32, %mm5
+	negl	%eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop.  Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-1]
+	C ecx	return value
+	C edx	&dst[size-2]
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+
+	movq	(%ebx,%eax,4), %mm0
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+	movd	%mm0, (%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+	psrlq	%mm6, %mm5		C return value
+
+	psrlq	%mm6, %mm0
+	popl	%edi
+
+	movd	%mm5, %eax
+	popl	%ebx
+
+	movd	%mm0, 4(%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	movl	$4, %edi
+
+	movd	%ecx, %mm6		C rshift
+	testl	%edi, %ebx
+
+	psllq	$32, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process low limb separately (marked xxx) and
+	C step src and dst by one limb, making src aligned.
+	C
+	C source                  ebx
+	C --+-------+-------+-------+
+	C           |          xxx  |
+	C --+-------+-------+-------+
+	C         4mod8   0mod8   4mod8
+	C
+	C         dest            edx
+	C         --+-------+-------+
+	C           |       |  xxx  |
+	C         --+-------+-------+
+
+	movq	(%ebx), %mm0		C unaligned load
+
+	psrlq	%mm6, %mm0
+	addl	$4, %ebx
+
+	decl	%eax
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_src_aligned):
+
+
+	movq	(%ebx), %mm1
+	testl	%edi, %edx
+
+	psrlq	%mm6, %mm5		C retval
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, add 4 to make it so, and pretend the shift is
+	C 32 bits extra.  Low limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C          source          ebx
+	C          --+-------+-------+
+	C            |      mm1      |
+	C          --+-------+-------+
+	C                  4mod8   0mod8
+	C
+	C  dest                    edx
+	C  --+-------+-------+-------+
+	C                    |  xxx  |
+	C  --+-------+-------+-------+
+	C          4mod8   0mod8   4mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psrlq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_dst_aligned):
+
+
+	movq	8(%ebx), %mm3
+	negl	%ecx
+
+	movq	%mm3, %mm2		C mm2 src qword
+	addl	$64, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	%mm6, %mm1
+
+	leal	-12(%ebx,%eax,4), %ebx
+	leal	-20(%edx,%eax,4), %edx
+
+	psllq	%mm7, %mm3
+	subl	$7, %eax		C size-7
+
+	por	%mm1, %mm3		C mm3 ready to store
+	negl	%eax			C -(size-7)
+
+	jns	L(finish)
+
+
+	C This loop is the important bit, the rest is just support.  Careful
+	C instruction scheduling achieves the claimed 1.75 c/l.  The
+	C relevant parts of the pairing rules are:
+	C
+	C - mmx loads and stores execute only in the U pipe
+	C - only one mmx shift in a pair
+	C - wait one cycle before storing an mmx register result
+	C - the usual address generation interlock
+	C
+	C Two qword calculations are slightly interleaved.  The instructions
+	C marked "C" belong to the second qword, and the "C prev" one is for
+	C the second qword from the previous iteration.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-12]
+	C ecx
+	C edx	&dst[size-12]
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from -8(%ebx,%eax,4)
+	C mm3	dst qword ready to store to -8(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	rshift
+	C mm7	lshift
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	8(%ebx,%eax,4), %mm3	C
+	psrlq	%mm6, %mm1		C
+
+	movq	%mm0, (%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psllq	%mm7, %mm3		C
+	addl	$4, %eax
+
+	por	%mm1, %mm3		C
+	js	L(unroll_loop)
+
+
+L(finish):
+	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
+
+	testb	$2, %al
+
+	jnz	L(finish_no_two)
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	addl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
+	C
+	C mm2	src prev qword, from -8(%ebx,%eax,4)
+	C mm3	dst qword, for -8(%edx,%eax,4)
+
+	testb	$1, %al
+	popl	%edi
+
+	movd	%mm5, %eax	C retval
+	jnz	L(finish_zero)
+
+
+	C One extra limb, destination was aligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                                  edx
+	C +-------+---------------+---------------+--
+	C |       |               |      mm3      |
+	C +-------+---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra limb, destination was unaligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                          edx
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 8(%edx), and in the aligned case
+	C there's a further extra limb of dst to be formed.
+
+
+	movd	8(%ebx), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, (%edx)
+	por	%mm2, %mm0
+
+	psrlq	%mm6, %mm1
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_one_unaligned)
+
+	C dst was aligned, must store one extra limb
+	movd	%mm1, 16(%edx)
+L(finish_one_unaligned):
+
+	movq	%mm0, 8(%edx)
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra limbs, destination was aligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                        edx+4
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra limbs, destination was unaligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                edx+4
+	C +-------+---------------+--
+	C |       |      mm3      |
+	C +-------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case is clearly the same data as the
+	C movq for the aligned case, it's just a choice between whether one
+	C or two limbs should be written.
+
+
+	movq	%mm3, 4(%edx)
+	psrlq	%mm6, %mm2
+
+	movd	%mm2, 12(%edx)
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm b/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm
new file mode 100644
index 0000000..2d88223
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm
@@ -0,0 +1,192 @@
+dnl  Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jne	L(one)
+
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+
+	shrl	$24, %edx
+	andl	$0xFFFFFF, %eax
+
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	shrl	$16, %ecx
+	andl	$0xFFFF, %edx
+
+	shll	$8, %edx
+	addl	%ecx, %eax
+
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	xorl	%esi, %esi		C 0mod3
+	xorl	%edi, %edi		C 1mod3
+
+	xorl	%ebp, %ebp		C 2mod3, and clear carry
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs
+	C edx	src
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	movl	(%edx), %eax
+	movl	4(%edx), %ebx
+
+	adcl	%eax, %esi
+	movl	8(%edx), %eax
+
+	adcl	%ebx, %edi
+	leal	12(%edx), %edx
+
+	adcl	%eax, %ebp
+	leal	-2(%ecx), %ecx
+
+	decl	%ecx
+	jg	L(top)
+
+
+	C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively
+
+	movl	$0xFFFFFFFF, %ebx	C mask
+	incl	%ecx
+
+	js	L(combine)		C 0 more
+
+	movl	(%edx), %eax
+	movl	$0xFFFFFF00, %ebx
+
+	adcl	%eax, %esi
+	decl	%ecx
+
+	js	L(combine)		C 1 more
+
+	movl	4(%edx), %eax
+	movl	$0xFFFF0000, %ebx
+
+	adcl	%eax, %edi
+
+
+
+L(combine):
+	C eax
+	C ebx	mask
+	C ecx
+	C edx
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%esi, %eax		C 0mod3
+
+	andl	%ebx, %ecx		C masked for position
+	andl	$0xFFFFFF, %eax		C 0mod3 low
+
+	shrl	$24, %esi		C 0mod3 high
+	subl	%ecx, %eax		C apply carry
+
+	addl	%esi, %eax		C apply 0mod3
+	movl	%edi, %ebx		C 1mod3
+
+	shrl	$16, %edi		C 1mod3 high
+	andl	$0x0000FFFF, %ebx
+
+	shll	$8, %ebx		C 1mod3 low
+	addl	%edi, %eax		C apply 1mod3 high
+
+	addl	%ebx, %eax		C apply 1mod3 low
+	movl	%ebp, %ebx		C 2mod3
+
+	shrl	$8, %ebp		C 2mod3 high
+	andl	$0xFF, %ebx
+
+	shll	$16, %ebx		C 2mod3 low
+	addl	%ebp, %eax		C apply 2mod3 high
+
+	addl	%ebx, %eax		C apply 2mod3 low
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mode1o.asm b/third_party/gmp/mpn/x86/pentium/mode1o.asm
new file mode 100644
index 0000000..a90abca
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mode1o.asm
@@ -0,0 +1,279 @@
+dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 23.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C There seems no way to pair up the two lone instructions in the main loop.
+C
+C The special case for size==1 saves about 20 cycles (non-PIC), making it
+C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
+C all sizes.
+C
+C Alternatives:
+C
+C Using mmx for the multiplies might be possible, with pmullw and pmulhw
+C having just 3 cycle latencies, but carry bit handling would probably be
+C complicated.
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+dnl  re-using parameter space
+define(VAR_INVERSE,`PARAM_SIZE')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_CARRY, %edx
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	xorl	%edx, %edx		C carry
+
+L(start_1c):
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	shrl	%eax			C d/2
+	LEA(	binvert_limb_table, %ecx)
+	pushl	%ebx		FRAME_pushl()
+	movl	PARAM_SIZE, %ebx
+
+	andl	$127, %eax
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl
+	jc	L(one_limb)
+',`
+	call	L(here)		FRAME_pushl()
+L(here):
+
+	shrl	%eax			C d/2
+	movl	(%esp), %ecx		C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
+	movl	%ebx, (%esp)		C push ebx
+
+	andl	$127, %eax
+	movl	PARAM_SIZE, %ebx
+
+	movl	binvert_limb_table@GOT(%ecx), %ecx
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl			C inv 8 bits
+	jc	L(one_limb)
+')
+',`
+dnl non-PIC
+	shrl	%eax			C d/2
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	andl	$127, %eax
+
+	subl	$2, %ebx
+	jc	L(one_limb)
+
+	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
+')
+
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+	pushl	%esi		FRAME_pushl()
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	%ecx, %eax
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax')
+
+	movl	PARAM_SRC, %esi
+	movl	%ecx, VAR_INVERSE
+
+	movl	(%esi), %eax		C src[0]
+	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
+
+	xorl	$-1, %ebx		C -(size-1)
+	ASSERT(nz)
+	jmp	L(entry)
+
+
+C The use of VAR_INVERSE means only a store is needed for that value, rather
+C than a push and pop of say %edi.
+
+	ALIGN(16)
+L(top):
+	C eax	scratch, low product
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	scratch, high product
+	C esi	&src[size-1]
+	C edi
+	C ebp
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	(%esi,%ebx,4), %eax	C src[i]
+	subl	%ecx, %edx		C h -= -c
+
+L(entry):
+	subl	%edx, %eax		C s = src[i] - h
+
+	sbbl	%ecx, %ecx		C new -c (0 or -1)
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	incl	%ebx
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi), %eax		C src high
+	subl	%ecx, %edx		C h -= -c
+
+	cmpl	PARAM_DIVISOR, %eax
+
+	jbe	L(skip_last)
+deflit(FRAME_LAST,FRAME)
+
+
+	subl	%edx, %eax		C s = src[i] - h
+	popl	%esi		FRAME_popl()
+
+	sbbl	%ecx, %ecx		C c (0 or -1)
+	popl	%ebx		FRAME_popl()
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	%edx, %eax
+
+	subl	%ecx, %eax
+
+	ret
+
+
+C When high<divisor can skip last step.
+
+L(skip_last):
+deflit(`FRAME',FRAME_LAST)
+	C eax	src high
+	C ebx
+	C ecx
+	C edx	r
+	C esi
+
+	subl	%eax, %edx	C r-s
+	popl	%esi		FRAME_popl()
+
+	sbbl	%eax, %eax	C -1 if underflow
+	movl	PARAM_DIVISOR, %ebx
+
+	andl	%ebx, %eax	C divisor if underflow
+	popl	%ebx		FRAME_popl()
+
+	addl	%edx, %eax	C addback if underflow
+
+	ret
+
+
+C Special case for size==1 using a division for r = c-a mod d.
+C Could look for a-c<d and save a division sometimes, but that doesn't seem
+C worth bothering about.
+
+L(one_limb):
+deflit(`FRAME',4)
+	C eax
+	C ebx	size-2 (==-1)
+	C ecx
+	C edx	carry
+	C esi	src end
+	C edi
+	C ebp
+
+	movl	%edx, %eax
+	movl	PARAM_SRC, %edx
+
+	movl	PARAM_DIVISOR, %ecx
+	popl	%ebx		FRAME_popl()
+
+	subl	(%edx), %eax		C c-a
+
+	sbbl	%edx, %edx
+	decl	%ecx			C d-1
+
+	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
+
+	divl	PARAM_DIVISOR
+
+	movl	%edx, %eax
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_1.asm b/third_party/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000..a0858af
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,177 @@
+dnl  Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 12.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       mp_limb_t multiplier, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	shrl	%eax
+	jnz	L(two_or_more)
+
+
+	C one limb only
+
+	movl	(%esi), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%eax, %ecx
+	movl	PARAM_DST, %eax
+
+	adcl	$0, %edx
+	popl	%esi
+
+	movl	%ecx, (%eax)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size/2
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size/2-1
+
+	notl	%ebx			C -size, preserve carry
+
+	leal	(%esi,%eax,8), %esi	C src end
+	leal	(%edi,%eax,8), %edi	C dst end
+
+	pushl	%ebp		FRAME_pushl()
+	jnc	L(top)
+
+
+	C size was odd, process one limb separately
+
+	movl	(%esi,%ebx,8), %eax
+	addl	$4, %esi
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	%edx, %ecx
+
+	movl	%eax, (%edi,%ebx,8)
+	leal	4(%edi), %edi
+
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ecx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ecx, (%edi,%ebx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ebx,8)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebp
+
+	movl	%ecx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_2.asm b/third_party/gmp/mpn/x86/pentium/mul_2.asm
new file mode 100644
index 0000000..4c7beb5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_2.asm
@@ -0,0 +1,150 @@
+dnl  Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 24.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_srcptr mult);
+C
+C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1,
+C but has the advantage of making just one pass over the operands.
+C
+C There aren't enough registers to use PARAM_MULT directly, so the
+C multiplier limbs are transferred to local variables on the stack.
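C
C A portable C sketch (not GMP's actual code) of what mpn_mul_2 computes may
C help: {dst,size+1} plus the returned limb hold {src,size} * {mult,2},
C built from a mul_1 pass and an addmul_1 pass, using 32-bit limbs with
C 64-bit intermediates as this x86 code does.

```c
#include <stdint.h>
#include <stddef.h>

/* dst[0..n-1] = src * m, return carry limb */
static uint32_t ref_mul_1(uint32_t *dst, const uint32_t *src, size_t n,
                          uint32_t m)
{
    uint64_t carry = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t p = (uint64_t)src[i] * m + carry;
        dst[i] = (uint32_t)p;
        carry = p >> 32;
    }
    return (uint32_t)carry;
}

/* dst[0..n-1] += src * m, return carry limb */
static uint32_t ref_addmul_1(uint32_t *dst, const uint32_t *src, size_t n,
                             uint32_t m)
{
    uint64_t carry = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t p = (uint64_t)src[i] * m + dst[i] + carry;
        dst[i] = (uint32_t)p;
        carry = p >> 32;
    }
    return (uint32_t)carry;
}

/* sketch of mpn_mul_2: dst gets the low n+1 product limbs, the top
   limb is returned */
uint32_t ref_mul_2(uint32_t *dst, const uint32_t *src, size_t n,
                   const uint32_t *mult)
{
    dst[n] = ref_mul_1(dst, src, n, mult[0]);      /* src * mult[0]       */
    return ref_addmul_1(dst + 1, src, n, mult[1]); /* += src*mult[1]<<32  */
}
```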
+
+defframe(PARAM_MULT, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,   8)
+defframe(PARAM_DST,   4)
+
+dnl  re-use parameter space
+define(VAR_MULT_LOW, `PARAM_SRC')
+define(VAR_MULT_HIGH,`PARAM_DST')
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_2)
+deflit(`FRAME',0)
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULT, %eax
+	movl	PARAM_SIZE, %ecx
+
+	movl	4(%eax), %edx		C mult high
+	movl	(%eax), %eax		C mult low
+
+	movl	%eax, VAR_MULT_LOW
+	movl	%edx, VAR_MULT_HIGH
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	mull	(%esi)			C src[0] * mult[0]
+
+	movl	%eax, %ebp		C in case src==dst
+	movl	(%esi), %eax		C src[0]
+
+	movl	%ebp, (%edi)		C dst[0]
+	movl	%edx, %ebx		C initial low carry
+
+	xorl	%ebp, %ebp		C initial high carry
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	mull	VAR_MULT_HIGH		C src[0] * mult[1]
+
+	subl	$2, %ecx		C size-2
+	js	L(done)
+
+	leal	8(%esi,%ecx,4), %esi	C &src[size]
+	xorl	$-1, %ecx		C -(size-1)
+
+
+
+L(top):
+	C eax	low prod
+	C ebx	low carry
+	C ecx	counter, negative
+	C edx	high prod
+	C esi	src end
+	C edi	dst end
+	C ebp	high carry (0 or -1)
+
+	andl	$1, %ebp		C 1 or 0
+	addl	%eax, %ebx
+
+	adcl	%edx, %ebp
+	ASSERT(nc)
+	movl	(%esi,%ecx,4), %eax
+
+	mull	VAR_MULT_LOW
+
+	addl	%eax, %ebx		C low carry
+	movl	(%esi,%ecx,4), %eax
+
+	adcl	%ebp, %edx		C high carry
+	movl	%ebx, (%edi,%ecx,4)
+
+	sbbl	%ebp, %ebp		C new high carry, -1 or 0
+	movl	%edx, %ebx		C new low carry
+
+	mull	VAR_MULT_HIGH
+
+	incl	%ecx
+	jnz	L(top)
+
+
+L(done):
+	andl	$1, %ebp		C 1 or 0
+	addl	%ebx, %eax
+
+	adcl	%ebp, %edx
+	ASSERT(nc)
+	movl	%eax, (%edi)		C store carry low
+
+	movl	%edx, %eax		C return carry high
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_basecase.asm b/third_party/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000..e1d0f05
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,142 @@
+dnl  Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+
+dnl  Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.2 cycles/crossproduct (approx)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
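C
C As a reference for the schoolbook scheme the loops below implement, a C
C sketch (not GMP's actual code; the asm's first pass is a plain mul_1
C rather than zero-then-addmul, but zeroing first keeps the sketch short):

```c
#include <stdint.h>
#include <stddef.h>

/* Schoolbook multiply: {wp,xn+yn} = {xp,xn} * {yp,yn}, 32-bit limbs. */
void ref_mul_basecase(uint32_t *wp, const uint32_t *xp, size_t xn,
                      const uint32_t *yp, size_t yn)
{
    for (size_t i = 0; i < xn + yn; i++)
        wp[i] = 0;
    for (size_t i = 0; i < yn; i++) {            /* one row per y limb */
        uint64_t carry = 0;
        for (size_t j = 0; j < xn; j++) {
            uint64_t t = (uint64_t)xp[j] * yp[i] + wp[i + j] + carry;
            wp[i + j] = (uint32_t)t;
            carry = t >> 32;
        }
        wp[i + xn] = (uint32_t)carry;            /* top limb of this row */
    }
}
```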
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP,    16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP,    8)
+defframe(PARAM_WP,    4)
+
+defframe(VAR_COUNTER, -4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+	pushl	%eax			C dummy push for allocating stack slot
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',16)
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	movl	PARAM_XSIZE,%eax
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+	leal	(%esi,%eax,4),%esi	C make xp point at end
+	leal	(%edi,%eax,4),%edi	C offset wp by xsize
+	negl	%ecx			C negate j size/index for inner loop
+	xorl	%eax,%eax		C clear carry
+
+	ALIGN(8)
+L(oop1):	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax	C load next limb at xp[j]
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%eax,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop1)
+
+	adcl	$0,%ebx
+	movl	PARAM_YSIZE,%eax
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	addl	$4,%ebp			C make ebp point to next y limb
+	movl	PARAM_XSIZE,%ecx
+	negl	%ecx
+	xorl	%ebx,%ebx
+
+	C code at 0x61 here, close enough to aligned
+L(oop2):
+	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	(%edi,%ecx,4),%ebx
+	adcl	$0,%edx
+	addl	%eax,%ebx
+	movl	%ebx,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop2)
+
+	adcl	$0,%ebx
+
+	movl	%ebx,(%edi)
+	addl	$4,%edi
+	movl	VAR_COUNTER,%eax
+	decl	%eax
+	movl	%eax,VAR_COUNTER
+	jnz	L(outer)
+
+L(skip):
+	popl	%ebx
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$4,%esp
+	ret
+
+L(done):
+	movl	%edx,4(%edi)	C store to wp[1]
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	popl	%eax		C dummy pop for deallocating stack slot
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/popcount.asm b/third_party/gmp/mpn/x86/pentium/popcount.asm
new file mode 100644
index 0000000..0e82144
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/popcount.asm
@@ -0,0 +1,146 @@
+dnl  Intel P5 mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 8.0 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C An arithmetic approach has been found to be slower than the table lookup,
+C due to needing too many instructions.
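C
C A C sketch of the byte-table scheme used here (the table name and layout
C below are illustrative, not GMP's internals): a 256-entry table of
C per-byte bit counts, summed over every byte of the source.

```c
#include <stdint.h>
#include <stddef.h>

static unsigned char pop_table[256];

static void init_pop_table(void)
{
    /* popcount(i) = popcount(i/2) + low bit of i */
    for (int i = 0; i < 256; i++)
        pop_table[i] = pop_table[i / 2] + (i & 1);
}

unsigned long ref_popcount(const uint32_t *src, size_t size)
{
    unsigned long total = 0;
    init_pop_table();
    for (size_t i = 0; i < size; i++)
        for (int b = 0; b < 4; b++)
            total += pop_table[(src[i] >> (8 * b)) & 0xff];
    return total;
}
```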
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: exporting the table to hamdist is incorrect, as it hurts
+C incremental linking.
+
+	RODATA
+	ALIGN(8)
+	GLOBL	TABLE_NAME
+TABLE_NAME:
+forloop(i,0,255,
+`	.byte	m4_popcount(i)
+')
+
+defframe(PARAM_SIZE,8)
+defframe(PARAM_SRC, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_popcount)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	shll	%ecx		C size in byte pairs
+	LEA(	TABLE_NAME, %ebp)
+	movl	PARAM_SRC, %esi
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+',`
+	call	L(here)
+L(here):
+	popl	%ebp
+	shll	%ecx		C size in byte pairs
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	movl	PARAM_SRC, %esi
+
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%edx, %edx	C byte
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	shll	%ecx		C size in byte pairs
+	movl	PARAM_SRC, %esi
+
+	pushl	%ebx	FRAME_pushl()
+	xorl	%eax, %eax	C total
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+define(TABLE,`TABLE_NAME`'($1)')
+')
+
+
+	ALIGN(8)	C necessary on P55 for claimed speed
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src
+	C edi
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-2(%esi,%ecx,2), %dl
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/rshift.asm b/third_party/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000..2105c4c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
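C
C For reference, a portable C sketch of mpn_rshift's semantics (not GMP's
C code): shift {src,size} right by 1 <= shift <= 31 bits into dst, and
C return the bits shifted out of src[0], left-justified in a limb.

```c
#include <stdint.h>
#include <stddef.h>

uint32_t ref_rshift(uint32_t *dst, const uint32_t *src, size_t size,
                    unsigned shift)
{
    uint32_t retval = src[0] << (32 - shift);    /* bits shifted out low */
    for (size_t i = 0; i + 1 < size; i++)
        dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
    dst[size - 1] = src[size - 1] >> shift;      /* most significant limb */
    return retval;
}
```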
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
+	leal	(%edi,%ebp,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		C jump if s_ptr >= res_ptr + size
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl(	%cl, %ebx, %edx)
+	shrdl(	%cl, %eax, %ebx)
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl(	%cl, %edx, %eax)
+	shrdl(	%cl, %ebx, %edx)
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl(	%cl,%eax,%edx)		C compute result limb
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		C compute most significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	shrl	%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,(%edi)
+	rcrl	%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	rcrl	%ebx
+	movl	%edx,-8(%edi)
+	rcrl	%eax
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	rcrl	%edx
+	movl	%eax,-16(%edi)
+	rcrl	%ebx
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,-24(%edi)
+	rcrl	%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		C use leal not to clobber carry
+	leal	-32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	rcrl	%edx
+	movl	%ebx,(%edi)
+
+	leal	-4(%esi),%esi		C use leal not to clobber carry
+	leal	-4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	movl	$0,%eax
+	rcrl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm b/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000..b11d767
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,528 @@
+dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+C product at around 20x20 limbs.
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
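C
C The overall scheme (cross products accumulated once, doubled by a left
C shift, then the diagonal squares added in) can be sketched in C as
C follows; this is a simplified reference, not GMP's actual code.

```c
#include <stdint.h>
#include <stddef.h>

/* {dst,2n} = {src,n} squared, 32-bit limbs. */
void ref_sqr_basecase(uint32_t *dst, const uint32_t *src, size_t n)
{
    for (size_t i = 0; i < 2 * n; i++)
        dst[i] = 0;

    /* cross products src[i]*src[j] for i<j, accumulated at dst[i+j] */
    for (size_t i = 0; i < n; i++) {
        uint64_t carry = 0;
        for (size_t j = i + 1; j < n; j++) {
            uint64_t t = (uint64_t)src[i] * src[j] + dst[i + j] + carry;
            dst[i + j] = (uint32_t)t;
            carry = t >> 32;
        }
        dst[i + n] = (uint32_t)carry;
    }

    /* double the triangle: left shift by one bit */
    uint32_t cy = 0;
    for (size_t i = 0; i < 2 * n; i++) {
        uint32_t next = dst[i] >> 31;
        dst[i] = (dst[i] << 1) | cy;
        cy = next;
    }

    /* add in the diagonal squares src[i]^2 at dst[2i] */
    uint64_t carry = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t sq = (uint64_t)src[i] * src[i];
        uint64_t t = (uint64_t)dst[2 * i] + (uint32_t)sq + carry;
        dst[2 * i] = (uint32_t)t;
        t = (uint64_t)dst[2 * i + 1] + (sq >> 32) + (t >> 32);
        dst[2 * i + 1] = (uint32_t)t;
        carry = t >> 32;
    }
}
```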
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+
+	movl	(%eax), %eax
+	ja	L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	pushl	%esi
+	pushl	%ebx
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	%edx, %esi	C dst[1]
+
+	movl	4(%ebx), %eax
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+	popl	%ebx
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+	popl	%esi
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	cmpl	$4, %edx
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_SRC, %ebx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%ebx), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+
+	movl	8(%ebx), %eax
+	pushl	%esi		C risk of cache bank clash
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movl	(%ebx), %eax
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%ebx), %eax
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %ebx
+	addl	%esi, %eax
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %esi
+
+	adcl	%ebp, %esi
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	%esi, 12(%ecx)
+
+	adcl	%edx, %edi
+	popl	%esi
+
+	movl	20(%ecx), %eax
+	movl	%edi, 16(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	adcl	%ebx, %eax	C no carry out of this
+	popl	%ebx
+
+	movl	%eax, 20(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(four_or_more):
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+	C esi
+	C edi
+	C ebp
+	C
+	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+	pushl	%edi
+FRAME_pushl()
+	pushl	%esi
+FRAME_pushl()
+
+	pushl	%ebp
+FRAME_pushl()
+	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
+
+	leal	(%ebx,%edx,4), %esi	C src end
+	movl	%ebx, %ebp		C src
+
+	negl	%edx			C -size
+	xorl	%ebx, %ebx		C clear carry limb and carry flag
+
+	leal	1(%edx), %ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	src
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last two products, which are the end corner of the product
+	C triangle, are handled separately to save looping overhead.  These
+	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+	C If size is 4 then it's only these that need to be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	adcl	$0, %ebx
+	movl	PARAM_SIZE, %edx
+
+	movl	%ebx, (%edi)
+	subl	$4, %edx
+
+	negl	%edx
+	jz	L(corner)
+
+
+L(outer):
+	C ebx	previous carry limb to store
+	C edx	outer loop counter (negative)
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+
+	pushl	%edx			C new outer loop counter
+	leal	-2(%edx), %ecx
+
+	movl	%ebx, (%edi)
+	addl	$4, %edi
+
+	addl	$4, %ebp
+	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	&src[j]
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%ebx, %eax
+	movl	(%edi,%ecx,4), %ebx
+
+	adcl	$0, %edx
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(inner)
+
+
+	adcl	$0, %ebx
+	popl	%edx		C outer loop counter
+
+	incl	%edx
+	jnz	L(outer)
+
+
+	movl	%ebx, (%edi)
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+
+	movl	-8(%esi), %eax
+	movl	-4(%edi), %ebx		C risk of data cache bank clash here
+
+	mull	-12(%esi)		C src[size-2]*src[size-3]
+
+	addl	%eax, %ebx
+	movl	%edx, %ecx
+
+	adcl	$0, %ecx
+	movl	-4(%esi), %eax
+
+	mull	-12(%esi)		C src[size-1]*src[size-3]
+
+	addl	%ecx, %eax
+	movl	(%edi), %ecx
+
+	adcl	$0, %edx
+	movl	%ebx, -4(%edi)
+
+	addl	%eax, %ecx
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	movl	-4(%esi), %eax
+
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+
+	movl	%ecx, (%edi)
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	movl	PARAM_SIZE, %eax
+
+	negl	%eax
+	movl	%ebx, 4(%edi)
+
+	addl	$1, %eax		C -(size-1) and clear carry
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	movl	12(%edi,%eax,8), %ebx
+
+	rcll	%ebx
+	movl	16(%edi,%eax,8), %ecx
+
+	rcll	%ecx
+	movl	%ebx, 12(%edi,%eax,8)
+
+	movl	%ecx, 16(%edi,%eax,8)
+	incl	%eax
+
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	PARAM_SRC, %esi
+
+	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
+	movl	%eax, 12(%edi)		C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx
+
+	mull	%eax
+
+	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
+	movl	%edx, %ebx
+
+	addl	$1, %ecx		C size-1 and clear carry
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, negative
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %ebx
+
+	mull	%eax
+
+	movl	16-4(%edi,%ecx,8), %ebp
+
+	addl	%ebp, %ebx
+	movl	16(%edi,%ecx,8), %ebp
+
+	adcl	%eax, %ebp
+	movl	%ebx, 16-4(%edi,%ecx,8)
+
+	movl	%ebp, 16(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(diag)
+
+
+	adcl	$0, %edx
+	movl	16-4(%edi), %eax	C dst most significant limb
+
+	addl	%eax, %edx
+	popl	%ebp
+
+	movl	%edx, 16-4(%edi)
+	popl	%esi		C risk of cache bank clash
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/README b/third_party/gmp/mpn/x86/pentium4/README
new file mode 100644
index 0000000..90f752e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/README
@@ -0,0 +1,124 @@
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                   INTEL PENTIUM-4 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium-4.
+
+The mmx subdirectory has routines using MMX instructions, the sse2
+subdirectory has routines using SSE2 instructions.  All P4s have these, the
+separate directories are just so configure can omit that code if the
+assembler doesn't support it.
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n            4 normal, 6 in-place
+
+	mpn_mul_1                  4 normal, 6 in-place
+	mpn_addmul_1               6
+	mpn_submul_1               7
+
+	mpn_mul_basecase           6 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase           3.5 cycles/crossproduct (approx)
+                                   or 7.0 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               1.75
+
+
+
+The shifts ought to be able to go at 1.5 c/l, but not much effort has been
+applied to them yet.
+
+In-place operations, and all addmul, submul, mul_basecase and sqr_basecase
+calls, suffer from pipeline anomalies associated with write combining and
+movd reads and writes to the same or nearby locations.  The movq
+instructions do not trigger the same hardware problems.  Unfortunately,
+using movq and splitting/combining seems to require too many extra
+instructions to help.  Perhaps future chip steppings will be better.
+
+
+
+NOTES
+
+The Pentium-4 pipeline, "Netburst", provides quite a number of surprises.
+Many traditional x86 instructions run very slowly, requiring use of
+alternative instructions for acceptable performance.
+
+adcl and sbbl are quite slow at 8 cycles for reg->reg.  paddq of 32-bits
+within a 64-bit mmx register seems better, though the combination
+paddq/psrlq when propagating a carry is still a 4 cycle latency.
+
+incl and decl should be avoided, instead use add $1 and sub $1.  Apparently
+the carry flag is not separately renamed, so incl and decl depend on all
+previous flags-setting instructions.
+
+shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest
+integer instructions (addl, subl, orl, andl, and some more).  shldl and
+shrdl seem to have 13 and 15 cycles latency, respectively.  Bizarre.
+
+movq mmx -> mmx does have 6 cycle latency, as noted in the documentation.
+pxor/por or similar combination at 2 cycles latency can be used instead.
+The movq however executes in the float unit, thereby saving MMX execution
+resources.  With the right juggling, data moves shouldn't be on a dependent
+chain.
+
+L1 is write-through, but the write-combining sounds like it does enough to
+not require explicit destination prefetching.
+
+xmm registers so far haven't found a use, but not much effort has been
+expended.  A configure test for whether the operating system knows
+fxsave/fxrestor will be needed if they're used.
+
+
+
+REFERENCES
+
+Intel Pentium-4 processor manuals,
+
+	http://developer.intel.com/design/pentium4/manuals
+
+"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001,
+order number 248966.  Available on-line:
+
+	http://developer.intel.com/design/pentium4/manuals/248966.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/pentium4/copyd.asm b/third_party/gmp/mpn/x86/pentium4/copyd.asm
new file mode 100644
index 0000000..82af81c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/copyd.asm
@@ -0,0 +1,71 @@
+dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
+dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
+dnl  We therefore use an open-coded 2 c/l copying loop.
+
+dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl  speeds, at least for large blocks.
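dnl
dnl  For reference, the semantics being implemented are simply a high-to-low
dnl  limb copy, sketched in C below; copying downward is the safe direction
dnl  when dst and src overlap with dst above src.

```c
#include <stdint.h>
#include <stddef.h>

void ref_copyd(uint32_t *dst, const uint32_t *src, size_t size)
{
    for (size_t i = size; i-- > 0; )   /* top limb first */
        dst[i] = src[i];
}
```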
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	movl	%ebx, PARAM_SIZE
+	addl	$-1, %ecx
+	js	L(end)
+
+L(loop):
+	movl	(%eax,%ecx,4), %ebx
+	movl	%ebx, (%edx,%ecx,4)
+	addl	$-1, %ecx
+
+	jns	L(loop)
+L(end):
+	movl	PARAM_SIZE, %ebx
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/copyi.asm b/third_party/gmp/mpn/x86/pentium4/copyi.asm
new file mode 100644
index 0000000..b614887
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/copyi.asm
@@ -0,0 +1,93 @@
+dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup
+dnl  time seems to be about 110 cycles.  It then copies at a rate of one
+dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
+dnl  loop for smaller sizes.
+
+dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl  speeds, at least for large blocks.
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	cmpl	$150, %ecx
+	jg	L(replmovs)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	movl	%ebx, PARAM_SIZE
+	testl	%ecx, %ecx
+	jz	L(end)
+
+L(loop):
+	movl	(%eax), %ebx
+	leal	4(%eax), %eax
+	addl	$-1, %ecx
+	movl	%ebx, (%edx)
+	leal	4(%edx), %edx
+
+	jnz	L(loop)
+
+L(end):
+	movl	PARAM_SIZE, %ebx
+	ret
+
+L(replmovs):
+	cld	C better safe than sorry, see mpn/x86/README
+
+	movl	%esi, %eax
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+	movl	PARAM_DST, %edi
+
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm b/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm
new file mode 100644
index 0000000..b5eca66
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-4 mpn_lshift -- left shift.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott:		    2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm b/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm
new file mode 100644
index 0000000..9563cb5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm
@@ -0,0 +1,203 @@
+dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
+dnl  hamming distance.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 model 9  (Banias)		?		?
+C P3 model 13 (Dothan)		6		6
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	8		9
+C P4 model 3  (Prescott)	8		9
+C P4 model 4  (Nocona)
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
+C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
+C and using them saves fiddling about with alignment testing on entry.
+C
+C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
+C might be possible, but 8 c/l relying on out-of-order execution is already
+C quite reasonable.
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+	RODATA
+	ALIGN(8)
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %edx
+	movd	%edx, %mm7
+	punpckldq %mm7, %mm7
+
+	movl	$0x33333333, %edx
+	movd	%edx, %mm6
+	punpckldq %mm6, %mm6
+
+	movl	$0x0F0F0F0F, %edx
+	movd	%edx, %mm5
+	punpckldq %mm5, %mm5
+
+HAM(`	movl	PARAM_SRC2, %edx')
+
+',`
+	dnl non-PIC
+HAM(`	movl	PARAM_SRC2, %edx')
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+
+	pxor	%mm4, %mm4		C zero
+	pxor	%mm0, %mm0		C total
+
+	subl	$1, %ecx
+	ja	L(top)
+
+L(last):
+	movd	(%eax,%ecx,4), %mm1		C src high limb
+HAM(`	movd	(%edx,%ecx,4), %mm2
+	pxor	%mm2, %mm1
+')
+	jmp	L(loaded)
+
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, size-1 to 2 or 1, inclusive
+	C edx	[hamdist] src2
+	C
+	C mm0	total (low dword)
+	C mm1	(scratch)
+	C mm2	(scratch)
+	C mm3
+	C mm4	0x0000000000000000
+	C mm5	0x0F0F0F0F0F0F0F0F
+	C mm6	0x3333333333333333
+	C mm7	0xAAAAAAAAAAAAAAAA
+
+	movd	(%eax), %mm1
+	movd	4(%eax), %mm2
+	punpckldq %mm2, %mm1
+	addl	$8, %eax
+
+HAM(`	movd	(%edx), %mm2
+	movd	4(%edx), %mm3
+	punpckldq %mm3, %mm2
+	pxor	%mm2, %mm1
+	addl	$8, %edx
+')
+
+L(loaded):
+	movq	%mm7, %mm2
+	pand	%mm1, %mm2
+	psrlq	$1, %mm2
+	psubd	%mm2, %mm1	C bit pairs
+
+	movq	%mm6, %mm2
+	pand	%mm1, %mm2
+	psrlq	$2, %mm1
+	pand	%mm6, %mm1
+	paddd	%mm2, %mm1	C nibbles
+
+	movq	%mm5, %mm2
+	pand	%mm1, %mm2
+	psrlq	$4, %mm1
+	pand	%mm5, %mm1
+	paddd	%mm2, %mm1	C bytes
+
+	psadbw(	%mm4, %mm1)
+	paddd	%mm1, %mm0	C to total
+
+	subl	$2, %ecx
+	jg	L(top)
+
+	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
+	jz	L(last)
+
+
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
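As an illustrative aside (not part of the patch): the mask-and-add reduction the loop above performs in MMX registers -- bit pairs via the 0xAAAA... mask, then nibbles via 0x33..., then bytes via 0x0F..., with psadbw doing the final horizontal sum -- can be sketched portably in C. The helper names below are hypothetical.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Same reduction as the MMX loop: count bits in 2-bit pairs, then
   nibbles, then bytes; the multiply replaces psadbw's horizontal sum. */
static unsigned popcount_word(uint64_t x)
{
    x = x - ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);               /* bit pairs */
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); /* nibbles */
    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL); /* bytes */
    return (unsigned)((x * 0x0101010101010101ULL) >> 56);     /* sum of bytes */
}

/* mpn_popcount analogue over an array of 32-bit limbs. */
static unsigned long popcount_limbs(const uint32_t *src, size_t n)
{
    unsigned long total = 0;
    for (size_t i = 0; i < n; i++)
        total += popcount_word(src[i]);
    return total;
}
```

For hamdist the only difference, as in the HAM(`...') paths above, is xoring the two source limbs before the reduction.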
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm b/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm
new file mode 100644
index 0000000..3ac0094
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-4 mpn_rshift -- right shift.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott:		    2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm
new file mode 100644
index 0000000..8e2380e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm
@@ -0,0 +1,101 @@
+dnl  Intel Pentium-4 mpn_add_n -- mpn addition.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4	     6		6
+C P4 model 3-4 (Prescott)	4.25	     7.5	7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_add_nc)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm0
+	jmp	L(start_nc)
+EPILOGUE()
+
+	ALIGN(8)
+PROLOGUE(mpn_add_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+L(start_nc):
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry bit
+
+	movd	(%eax,%ecx,4), %mm1
+	movd	(%ebx,%ecx,4), %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$32, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
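An aside, not part of the patch: the loop above carries via 64-bit paddq -- each iteration adds two 32-bit limbs plus the incoming carry in a 64-bit accumulator, stores the low dword, and psrlq $32 exposes the carry bit. A portable C sketch with hypothetical names:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* C analogue of the paddq/psrlq carry scheme in mpn_add_n above. */
static uint32_t add_n_sketch(uint32_t *dst, const uint32_t *s1,
                             const uint32_t *s2, size_t n)
{
    uint64_t acc = 0;                    /* plays the role of %mm0 */
    for (size_t i = 0; i < n; i++) {
        acc += (uint64_t)s1[i] + s2[i];  /* paddq of the two limbs */
        dst[i] = (uint32_t)acc;          /* movd low dword to dst  */
        acc >>= 32;                      /* psrlq $32: carry 0..1  */
    }
    return (uint32_t)acc;                /* final carry, as %eax   */
}
```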
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
new file mode 100644
index 0000000..93b63b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -0,0 +1,108 @@
+dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
+
+dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4.25	     6		6
+C P4 model 3-4 (Prescott)	5	     8.5	8.5
+
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
+C src2 is a slowdown.
+C
+C The dependent chain is simply the paddq of x+2*y to the previous carry,
+C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
+C is almost achieved for separate src/dst but when src==dst the write
+C combining anomalies slow it down.
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+
+	mov	PARAM_SRC2, %ebx
+	pxor	%mm0, %mm0		C initial carry
+
+	mov	PARAM_DST, %edx
+
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry
+
+	movd	(%ebx), %mm2
+	movd	(%eax), %mm1
+	psrlq	$32, %mm0
+	lea	4(%eax), %eax
+	lea	4(%ebx), %ebx
+
+	psllq	$1, %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+
+	movd	%mm0, (%edx,%ecx,4)
+	add	$1, %ecx
+	jnz	L(top)
+
+
+	psrlq	$32, %mm0
+	mov	SAVE_EBX, %ebx
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
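An aside, not part of the patch: the dependent chain described above (paddq of x+2*y into the carry, psrlq for the next carry) is easiest to see in portable C. Note the carry limb can be 0, 1 or 2, since x + 2*y + carry can exceed two limb values. Helper name is hypothetical.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* C analogue of mpn_addlsh1_n above: dst = x + 2*y, returning the
   carry (0..2).  The psrlq at the top of the asm loop corresponds to
   the shift at the top of this one. */
static uint32_t addlsh1_n_sketch(uint32_t *dst, const uint32_t *x,
                                 const uint32_t *y, size_t n)
{
    uint64_t acc = 0;                               /* %mm0 */
    for (size_t i = 0; i < n; i++) {
        acc >>= 32;                                 /* psrlq $32 */
        acc += (uint64_t)x[i] + 2 * (uint64_t)y[i]; /* psllq $1 + two paddq */
        dst[i] = (uint32_t)acc;
    }
    return (uint32_t)(acc >> 32);
}
```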
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
new file mode 100644
index 0000000..7810207
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -0,0 +1,189 @@
+dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		5.24
+C P6 model 13  (Dothan)		5.24
+C P4 model 0-1 (Willamette)	5
+C P4 model 2   (Northwood)	5
+C P4 model 3-4 (Prescott)	5
+
+C TODO:
+C  * Tweak eax/edx offsets in loop as to save some lea's
+C  * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C n		sp + 12
+C v0		sp + 16
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+	pxor	%mm6, %mm6
+L(ent):	mov	4(%esp), %edx
+	mov	8(%esp), %eax
+	mov	12(%esp), %ecx
+	movd	16(%esp), %mm7
+	cmp	$4, %ecx
+	jnc	L(big)
+
+L(lp0):	movd	(%eax), %mm0
+	lea	4(%eax), %eax
+	movd	(%edx), %mm4
+	lea	4(%edx), %edx
+	pmuludq	%mm7, %mm0
+	paddq	%mm0, %mm4
+	paddq	%mm4, %mm6
+	movd	%mm6, -4(%edx)
+	psrlq	$32, %mm6
+	dec	%ecx
+	jnz	L(lp0)
+	movd	%mm6, %eax
+	emms
+	ret
+
+L(big):	and	$3, %ecx
+	je	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	je	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+L(0):	movd	(%eax), %mm3
+	sub	12(%esp), %ecx		C loop count
+	lea	-16(%eax), %eax
+	lea	-12(%edx), %edx
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	movd	12(%edx), %mm5
+	pmuludq	%mm7, %mm0
+	movd	24(%eax), %mm1
+	paddq	%mm3, %mm5
+	movd	16(%edx), %mm4
+	jmp	L(00)
+
+L(1):	movd	(%eax), %mm2
+	sub	12(%esp), %ecx
+	lea	-12(%eax), %eax
+	lea	-8(%edx), %edx
+	movd	8(%edx), %mm4
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	paddq	%mm2, %mm4
+	movd	12(%edx), %mm5
+	jmp	L(01)
+
+L(2):	movd	(%eax), %mm1
+	sub	12(%esp), %ecx
+	lea	-8(%eax), %eax
+	lea	-4(%edx), %edx
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	jmp	L(10)
+
+L(3):	movd	(%eax), %mm0
+	sub	12(%esp), %ecx
+	lea	-4(%eax), %eax
+	pmuludq	%mm7, %mm0
+	movd	8(%eax), %mm1
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	paddq	%mm0, %mm4
+	movd	4(%edx), %mm5
+
+	ALIGN(16)
+L(top):	pmuludq	%mm7, %mm2
+	paddq	%mm4, %mm6
+	movd	16(%eax), %mm3
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+L(10):	pmuludq	%mm7, %mm3
+	paddq	%mm5, %mm6
+	movd	20(%eax), %mm0
+	paddq	%mm2, %mm4
+	movd	12(%edx), %mm5
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+L(01):	pmuludq	%mm7, %mm0
+	paddq	%mm4, %mm6
+	movd	24(%eax), %mm1
+	paddq	%mm3, %mm5
+	movd	16(%edx), %mm4
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+L(00):	pmuludq	%mm7, %mm1
+	paddq	%mm5, %mm6
+	movd	28(%eax), %mm2
+	paddq	%mm0, %mm4
+	movd	20(%edx), %mm5
+	movd	%mm6, 12(%edx)
+	psrlq	$32, %mm6
+	lea	16(%eax), %eax
+	lea	16(%edx), %edx
+	add	$4, %ecx
+	jnz	L(top)
+
+L(end):	pmuludq	%mm7, %mm2
+	paddq	%mm4, %mm6
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm5, %mm6
+	paddq	%mm2, %mm4
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm4, %mm6
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	ret
+EPILOGUE()
+PROLOGUE(mpn_addmul_1c)
+	movd	20(%esp), %mm6
+	jmp	L(ent)
+EPILOGUE()
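An aside, not part of the patch: stripped of the 4-way software pipelining above, mpn_addmul_1 is a single multiply-accumulate recurrence -- pmuludq forms the 64-bit product, the destination limb and running carry are added in, the low dword is stored and the high dword becomes the next carry. A portable C sketch (helper name hypothetical):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* rp[] += up[] * v0 over n limbs, returning the carry-out limb,
   mirroring the small-n L(lp0) loop above without the pipelining. */
static uint32_t addmul_1_sketch(uint32_t *rp, const uint32_t *up,
                                size_t n, uint32_t v0)
{
    uint64_t carry = 0;                              /* %mm6 */
    for (size_t i = 0; i < n; i++) {
        uint64_t t = (uint64_t)up[i] * v0            /* pmuludq */
                     + rp[i] + carry;                /* two paddq */
        rp[i] = (uint32_t)t;                         /* movd low dword */
        carry = t >> 32;                             /* psrlq $32 */
    }
    return (uint32_t)carry;
}
```

The mpn_addmul_1c entry point above is the same routine with the carry seeded from a fifth argument instead of zero.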
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..354300e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,141 @@
+dnl  Intel Atom  mpn_bdiv_dbm1c.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)		 9.75
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	 8.25
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 8
+C AMD K6			 -
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+C TODO: This code was optimised for atom-32, consider moving it back to atom
+C	dir (atom currently grabs this code), and write a 4-way version (7 c/l).
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL,  16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_MUL')
+define(SAVE_UP,`PARAM_SIZE')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n',  `%ecx')
+define(`reg', `%edx')
+define(`cy', `%eax')	C contains the return value
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_bdiv_dbm1c)
+	mov	PARAM_SIZE, n		C size
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	movd	PARAM_MUL, %mm7
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	shr	n
+	mov	PARAM_CARRY, cy
+	jz	L(eq1)
+
+	movd	4(up), %mm1
+	jc	L(odd)
+
+	lea	4(up), up
+	pmuludq	%mm7, %mm1
+	movd	%mm0, reg
+	psrlq	$32, %mm0
+	sub	reg, cy
+	movd	%mm0, reg
+	movq	%mm1, %mm0
+	dec	n
+	mov	cy, (rp)
+	lea	4(rp), rp
+	jz	L(end)
+
+C	ALIGN(16)
+L(top):	movd	4(up), %mm1
+	sbb	reg, cy
+L(odd):	movd	%mm0, reg
+	psrlq	$32, %mm0
+	pmuludq	%mm7, %mm1
+	sub	reg, cy
+	lea	8(up), up
+	movd	%mm0, reg
+	movd	(up), %mm0
+	mov	cy, (rp)
+	sbb	reg, cy
+	movd	%mm1, reg
+	psrlq	$32, %mm1
+	sub	reg, cy
+	movd	%mm1, reg
+	pmuludq	%mm7, %mm0
+	dec	n
+	mov	cy, 4(rp)
+	lea	8(rp), rp
+	jnz	L(top)
+
+L(end):	sbb	reg, cy
+
+L(eq1):	movd	%mm0, reg
+	psrlq	$32, %mm0
+	mov	SAVE_UP, up
+	sub	reg, cy
+	movd	%mm0, reg
+	emms
+	mov	cy, (rp)
+	sbb	reg, cy
+
+	mov	SAVE_RP, rp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
new file mode 100644
index 0000000..d5008f4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
@@ -0,0 +1,234 @@
+dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	movl	PARAM_DIVISOR, %ecx
+
+	movd	%ecx, %mm6
+	movl	PARAM_SHIFT, %ecx
+
+	movd	%ecx, %mm7		C shift
+
+	C
+
+	movl	PARAM_INVERSE, %ecx
+	movd	%ecx, %mm5		C inv
+
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+	pxor	%mm0, %mm0		C initial carry bit
+
+	subl	$1, %edx
+	jz	L(done)
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	dst, incrementing
+	C edx	counter, size-1 iterations
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm4	0x00000000FFFFFFFF
+	C mm5	inverse
+	C mm6	divisor
+	C mm7	shift
+
+	movd	(%eax), %mm2
+	movd	4(%eax), %mm3
+	addl	$4, %eax
+	punpckldq %mm3, %mm2
+
+	psrlq	%mm7, %mm2
+	pand	%mm4, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+	addl	$4, %ecx
+
+	movq	%mm6, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+L(entry):
+	subl	$1, %edx
+	jnz	L(top)
+
+L(done):
+	movd	(%eax), %mm2
+	psrlq	%mm7, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+
+	emms
+	ret
+
+EPILOGUE()
+
+	ALIGN(16)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_DIVISOR, %ecx
+
+	C eax	src
+	C ebx
+	C ecx	divisor
+	C edx	size-1
+
+	movl	%ecx, %eax
+	bsfl	%ecx, %ecx		C trailing twos
+
+	shrl	%cl, %eax		C d = divisor without twos
+	movd	%eax, %mm6
+	movd	%ecx, %mm7		C shift
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ecx)
+	movzbl	(%eax,%ecx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm5		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm5, %mm5		C inv*inv
+
+	C
+
+	pmuludq	%mm6, %mm5		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm5, %mm5
+
+	paddd	%mm0, %mm5
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+	C
+
+	pmuludq	%mm6, %mm0		C inv*inv*d
+	paddd	%mm5, %mm5		C 2*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+
+	C
+
+	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movq	%mm6, %mm0
+	pmuludq	%mm5, %mm0
+	movd	%mm0, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	pxor	%mm0, %mm0		C initial carry bit
+	jmp	L(entry)
+
+EPILOGUE()
+ASM_END()
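An aside, not part of the patch: mpn_bdiv_q_1 above builds the inverse of the odd divisor mod 2^32 with the Newton step inv = 2*inv - inv*inv*d, each step doubling the number of correct low bits. GMP seeds from the 8-bit binvert_limb_table (so two steps suffice); the C sketch below instead seeds with d itself, which is correct to 3 bits for any odd d, and takes four steps. Helper name hypothetical.

```c
#include <assert.h>
#include <stdint.h>

/* Inverse of an odd d modulo 2^32 via Newton iteration, as in the
   pmuludq/psubd sequence in mpn_bdiv_q_1 above. */
static uint32_t binvert32(uint32_t d)
{
    uint32_t inv = d;                 /* d*d == 1 mod 8: 3 correct bits */
    inv = 2 * inv - inv * inv * d;    /* 6 bits  */
    inv = 2 * inv - inv * inv * d;    /* 12 bits */
    inv = 2 * inv - inv * inv * d;    /* 24 bits */
    inv = 2 * inv - inv * inv * d;    /* >= 32 bits */
    return inv;                       /* d * inv == 1 mod 2^32 */
}
```

This is the property the ASSERT in the asm checks: d*inv == 1 mod 2^GMP_LIMB_BITS.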
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
new file mode 100644
index 0000000..b3f3474
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
@@ -0,0 +1,95 @@
+dnl  Intel Pentium-4 mpn_cnd_add_n -- mpn addition.
+
+dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		 -
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 4.67
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 5
+C P4 model 3-4 (Prescott)	 5.25
+
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST,  8)
+defframe(PARAM_CND,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+	TEXT
+	ALIGN(8)
+
+	ALIGN(8)
+PROLOGUE(mpn_cnd_add_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+
+	mov	PARAM_CND, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	movd	%eax, cnd
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm1
+	pand	cnd, %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$32, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
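An aside, not part of the patch: the neg/sbb pair at the top of mpn_cnd_add_n turns the condition word into an all-zero or all-one mask, which is then ANDed (pand) with each src2 limb -- so the addition is applied or suppressed without a data-dependent branch. A portable C sketch with a hypothetical name:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* dst = s1 + (cnd ? s2 : 0), branch-free on cnd, returning the carry. */
static uint32_t cnd_add_n_sketch(uint32_t cnd, uint32_t *dst,
                                 const uint32_t *s1, const uint32_t *s2,
                                 size_t n)
{
    uint32_t mask = -(uint32_t)(cnd != 0);        /* neg/sbb: 0 or ~0 */
    uint64_t acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc += (uint64_t)s1[i] + (s2[i] & mask);  /* pand then paddq */
        dst[i] = (uint32_t)acc;
        acc >>= 32;                               /* psrlq $32 */
    }
    return (uint32_t)acc;
}
```

mpn_cnd_sub_n in the next file is the same pattern with psubq and the borrow taken from bit 63 instead of bit 32.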
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
new file mode 100644
index 0000000..339a23e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
@@ -0,0 +1,114 @@
+dnl  Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction.
+
+dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		 -
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 4.67
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 5
+C P4 model 3-4 (Prescott)	 5.25
+
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST,  8)
+defframe(PARAM_CND,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_cnd_sub_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+
+	mov	PARAM_CND, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	movd	%eax, cnd
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm1
+	pand	cnd, %mm2
+	psubq	%mm2, %mm1
+
+	psubq	%mm0, %mm1
+	movd	%mm1, (%edx,%ecx,4)
+
+	psrlq	$63, %mm1
+
+	add	$1, %ecx
+	jz	L(done_mm1)
+
+	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm0
+	pand	cnd, %mm2
+	psubq	%mm2, %mm0
+
+	psubq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$63, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+L(done_mm1):
+	movd	%mm1, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm
new file mode 100644
index 0000000..0ceef5b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm
@@ -0,0 +1,216 @@
+dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
+C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
+C that might be hidden by out-of-order execution, whereas divl is around 60.
+C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
+C faster.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	movl	PARAM_DIVISOR, %ecx
+	subl	$1, %edx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+	xorl	%edx, %edx
+
+	divl	%ecx
+	movl	PARAM_DST, %ecx
+
+	movl	%eax, (%ecx)
+	ret
+
+
+L(two_or_more):
+	C eax	src
+	C ebx
+	C ecx	divisor
+	C edx	size-1
+
+	movl	%ecx, %eax
+	bsfl	%ecx, %ecx		C trailing twos
+
+	shrl	%cl, %eax		C d = divisor without twos
+	movd	%eax, %mm6
+	movd	%ecx, %mm7		C shift
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ecx)
+	movzbl	(%eax,%ecx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm5		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm5, %mm5		C inv*inv
+
+	C
+
+	pmuludq	%mm6, %mm5		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm5, %mm5
+
+	paddd	%mm0, %mm5
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+	C
+
+	pmuludq	%mm6, %mm0		C inv*inv*d
+	paddd	%mm5, %mm5		C 2*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+
+	C
+
+	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movq	%mm6, %mm0
+	pmuludq	%mm5, %mm0
+	movd	%mm0, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	pxor	%mm0, %mm0		C initial carry bit
+
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	dst, incrementing
+	C edx	counter, size-1 iterations
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm4	0x00000000FFFFFFFF
+	C mm5	inverse
+	C mm6	divisor
+	C mm7	shift
+
+	movd	(%eax), %mm2
+	movd	4(%eax), %mm3
+	addl	$4, %eax
+	punpckldq %mm3, %mm2
+
+	psrlq	%mm7, %mm2
+	pand	%mm4, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+	addl	$4, %ecx
+
+	movq	%mm6, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+	subl	$1, %edx
+	jnz	L(top)
+
+
+L(done):
+	movd	(%eax), %mm2
+	psrlq	%mm7, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+
+	emms
+	ret
+
+EPILOGUE()
+ASM_END()
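Editor's note: the Newton steps in the routine above (`inv = 2*inv - inv*inv*d`) double the number of correct low bits of the inverse each time, starting from the 8-bit `binvert_limb_table` lookup, until `d*inv == 1 mod 2^32`; each quotient limb is then a single multiply by `inv`, with the borrow recovered from the high half of `q*d`. A rough C model of the scheme for an odd 32-bit divisor — the seed here is `inv = d` (correct mod 8, since d*d == 1 mod 8 for odd d) rather than the table lookup, so it needs four Newton steps instead of two; names are illustrative:

```c
#include <stdint.h>
#include <stddef.h>

/* Inverse of an odd d modulo 2^32 by Newton iteration.
   Correct bits double per step: 3 -> 6 -> 12 -> 24 -> 48 >= 32. */
static uint32_t
binvert32 (uint32_t d)
{
  uint32_t inv = d;                      /* seed, correct mod 8 */
  for (int i = 0; i < 4; i++)
    inv *= 2 - d * inv;                  /* inv = inv*(2 - d*inv) */
  return inv;
}

/* Exact division dst[] = src[]/d for odd d, valid only when the
   division is known to be exact.  Mirrors the loop above: the
   quotient limb is q = (src - cbit - climb)*inv mod 2^32, and the
   next climb is the high half of q*d. */
static void
divexact_1_ref (uint32_t *dst, const uint32_t *src, size_t n, uint32_t d)
{
  uint32_t inv = binvert32 (d), c = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint32_t s = src[i] - c;           /* subtract borrow in */
      uint32_t cbit = s > src[i];        /* did the subtract wrap? */
      uint32_t q = s * inv;
      dst[i] = q;
      c = (uint32_t) (((uint64_t) q * d) >> 32) + cbit;
    }
}
```

The asm additionally strips trailing twos from the divisor with `bsfl`/`shrl` and shifts the source limbs accordingly; this sketch assumes that has already been done (d odd).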
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
new file mode 100644
index 0000000..0146fab
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -0,0 +1,645 @@
+dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C "m" is written for what is m' in the paper, and "d" for d_norm, which
+C won't cause any confusion since it's only the normalized divisor that's of
+C any use in the code.  "b" is written for 2^N, the size of a limb, N being
+C 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
+C us have just a psubq on the dependent chain.
+C
+C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
+C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is stored.  The
+C latter is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Enhancements:
+C
+C The loop measures 32 cycles, but the dependent chain would suggest it
+C could be done with 30.  Not sure where to start looking for the extras.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case.  It doesn't seem worth checking for this though, since it
+C probably occurs infrequently.
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The inverse takes about 80-90 cycles to calculate, but after that the
+dnl  multiply is 32 c/l versus division at about 58 c/l.
+dnl
+dnl  At 4 limbs the div is a touch faster than the mul (and of course
+dnl  simpler), so start the mul from 5 limbs.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+define(SAVE_EDI,`PARAM_DIVISOR')
+define(SAVE_EBX,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	xorl	%edx, %edx		C carry if can't skip a div
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movd	PARAM_PREINV_INVERSE, %mm4
+
+	movd	PARAM_PREINV_SHIFT, %mm7  C l
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+	movd	%edx, %mm0		C carry
+
+	movd	%edx, %mm1		C carry
+	movl	$0, %edx
+
+	movd	%ebp, %mm5		C d
+	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	movl	%edx, (%edi,%ecx,4)	C dst high limb
+	sbbl	$0, %ecx		C skip one division if high<divisor
+	movl	$32, %eax
+
+	subl	PARAM_PREINV_SHIFT, %eax
+	psllq	%mm7, %mm5		C d normalized
+	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
+	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
+
+	movd	%eax, %mm6		C 32-l
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %edx
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	xorl	%edx, %edx		C initial carry (if can't skip a div)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	orl	%ecx, %ecx		C size
+	jz	L(no_skip_div)		C if size==0
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
+	movl	%edx, (%edi,%ecx,4)	C dst high limb
+
+	movl	$0, %edx
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+
+	sbbl	$0, %ecx		C size-1 if high<divisor
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
+	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]
+
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)	C if size==0
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	carry
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C ebp	divisor
+
+	movl	(%esi), %eax
+	subl	$4, %esi
+
+	divl	%ebp
+
+	movl	%eax, (%edi)
+	subl	$4, %edi
+
+	subl	$1, %ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)	C if xsize!=0
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	movl	%edx, %eax
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	carry
+	C esi
+	C edi	dst, decrementing
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, (%edi)
+	subl	$4, %edi
+
+	subl	$1, %ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	&src[size-1]
+	C edi	&dst[size+xsize-1]
+	C ebp	divisor
+
+	bsrl	%ebp, %eax		C 31-l
+	movd	%edx, %mm0		C carry
+	movd	%edx, %mm1		C carry
+	movl	%ecx, %edx		C size
+	movl	$31, %ecx
+
+	C
+
+	xorl	%eax, %ecx		C l = leading zeros on d
+	addl	$1, %eax
+
+	shll	%cl, %ebp		C d normalized
+	movd	%ecx, %mm7		C l
+	movl	%edx, %ecx		C size
+
+	movd	%eax, %mm6		C 32-l
+	movl	$-1, %edx
+	movl	$-1, %eax
+
+	C
+
+	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1 / d)
+	movd	%ebp, %mm5		C d
+
+	C
+
+	movd	%eax, %mm4		C m
+
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	xsize
+	C ecx	size
+	C edx
+	C esi	&src[size-1]
+	C edi	&dst[size+xsize-1]
+	C ebp
+	C
+	C mm0	carry
+	C mm1	carry
+	C mm2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	psllq	%mm7, %mm0		C n2 = carry << l, for size==0
+
+	subl	$1, %ecx
+	jb	L(integer_none)
+
+	movd	(%esi), %mm0		C src high limb
+	punpckldq %mm1, %mm0
+	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
+	jz	L(integer_last)
+
+
+C The dependent chain here consists of
+C
+C	2   paddd    n1+n2
+C	8   pmuludq  m*(n1+n2)
+C	2   paddq    n2:nadj + m*(n1+n2)
+C	2   psrlq    q1
+C	8   pmuludq  d*q1
+C	2   psubq    (n-d)-q1*d
+C	2   psrlq    high n-(q1+1)*d mask
+C	2   pand     d masked
+C	2   paddd    n2+d addback
+C	--
+C	30
+C
+C But it seems to run at 32 cycles, so presumably there's something else
+C going on.
+
+	ALIGN(16)
+L(integer_top):
+	C eax
+	C ebx
+	C ecx	counter, size-1 to 0
+	C edx
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movd	-4(%esi), %mm1		C next src limbs
+	movd	(%esi), %mm2
+	leal	-4(%esi), %esi
+
+	punpckldq %mm2, %mm1
+	psrlq	%mm6, %mm1		C n10
+
+	movq	%mm1, %mm2		C n10
+	movq	%mm1, %mm3		C n10
+	psrad	$31, %mm1		C -n1
+	pand	%mm5, %mm1		C -n1 & d
+	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
+
+	psrld	$31, %mm2		C n1
+	paddd	%mm0, %mm2		C n2+n1
+	punpckldq %mm0, %mm1		C n2:nadj
+
+	pmuludq	%mm4, %mm2		C m*(n2+n1)
+
+	C
+
+	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
+	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
+	pcmpeqd	%mm2, %mm2		C FF...FF
+	psrlq	$63, %mm2		C 1
+
+	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
+
+	paddd	%mm1, %mm2		C q1+1
+	pmuludq	%mm5, %mm1		C q1*d
+
+	punpckldq %mm0, %mm3		C n = n2:n10
+	pxor	%mm0, %mm0
+
+	psubq	%mm5, %mm3		C n - d
+
+	C
+
+	psubq	%mm1, %mm3		C n - (q1+1)*d
+
+	por	%mm3, %mm0		C copy remainder -> new n2
+	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm3, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm3, %mm2		C q
+	pand	%mm5, %mm3		C mask & d
+
+	paddd	%mm3, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+	subl	$1, %ecx
+	ja	L(integer_top)
+
+
+L(integer_last):
+	C eax
+	C ebx	xsize
+	C ecx
+	C edx
+	C esi	&src[0]
+	C edi	&dst[xsize]
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movd	(%esi), %mm1		C src[0]
+	psllq	%mm7, %mm1		C n10
+
+	movq	%mm1, %mm2		C n10
+	movq	%mm1, %mm3		C n10
+	psrad	$31, %mm1		C -n1
+	pand	%mm5, %mm1		C -n1 & d
+	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
+
+	psrld	$31, %mm2		C n1
+	paddd	%mm0, %mm2		C n2+n1
+	punpckldq %mm0, %mm1		C n2:nadj
+
+	pmuludq	%mm4, %mm2		C m*(n2+n1)
+
+	C
+
+	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
+	pcmpeqd	%mm2, %mm2		C FF...FF
+	psrlq	$63, %mm2		C 1
+
+	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
+	paddd	%mm1, %mm2		C q1
+
+	pmuludq	%mm5, %mm1		C q1*d
+	punpckldq %mm0, %mm3		C n
+	psubq	%mm5, %mm3		C n - d
+	pxor	%mm0, %mm0
+
+	C
+
+	psubq	%mm1, %mm3		C n - (q1+1)*d
+
+	por	%mm3, %mm0		C remainder -> n2
+	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm3, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm3, %mm2		C q
+	pand	%mm5, %mm3		C mask & d
+
+	paddd	%mm3, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+
+L(integer_none):
+	C eax
+	C ebx	xsize
+
+	orl	%ebx, %ebx
+	jnz	L(fraction_some)	C if xsize!=0
+
+
+L(fraction_done):
+	movl	SAVE_EBP, %ebp
+	psrld	%mm7, %mm0		C remainder
+
+	movl	SAVE_EDI, %edi
+	movd	%mm0, %eax
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+
+L(fraction_some):
+	C eax
+	C ebx	xsize
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[xsize-1]
+	C ebp
+
+
+L(fraction_top):
+	C eax
+	C ebx	counter, xsize iterations
+	C ecx
+	C edx
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movq	%mm0, %mm1		C n2
+	pmuludq	%mm4, %mm0		C m*n2
+
+	pcmpeqd	%mm2, %mm2
+	psrlq	$63, %mm2
+
+	C
+
+	psrlq	$32, %mm0		C high(m*n2)
+
+	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)
+
+	paddd	%mm0, %mm2		C q1+1
+	pmuludq	%mm5, %mm0		C q1*d
+
+	psllq	$32, %mm1		C n = n2:0
+	psubq	%mm5, %mm1		C n - d
+
+	C
+
+	psubq	%mm0, %mm1		C r = n - (q1+1)*d
+	pxor	%mm0, %mm0
+
+	por	%mm1, %mm0		C r -> n2
+	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm1, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm1, %mm2		C q
+	pand	%mm5, %mm1		C mask & d
+
+	paddd	%mm1, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+	subl	$1, %ebx
+	jne	L(fraction_top)
+
+
+	jmp	L(fraction_done)
+
+EPILOGUE()
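Editor's note: the core step of the loops above follows the scheme the header comments cite, with `m = floor((b*(b-d)-1)/d)` computed by the `divl` in `mul_by_inverse`: an approximate quotient `q1` is taken from the high half of `n2:nadj + m*(n2+n1)`, then `n - d - q1*d` decides between `q1` and `q1+1` and drives the add-back of `d` into the remainder. A C model of one such step for a normalized (high-bit-set) 32-bit divisor, assuming the invariant `n2 < d`; names are illustrative:

```c
#include <stdint.h>

/* One step of division of the two-limb value n2:n10 by a normalized
   invariant d, with precomputed m = floor((B*(B-d)-1)/d), B = 2^32.
   Returns the quotient limb and stores the remainder (the next n2). */
static uint32_t
udiv_preinv32 (uint32_t n2, uint32_t n10, uint32_t d, uint32_t m,
               uint32_t *rem)
{
  uint32_t n1 = n10 >> 31;                   /* top bit of the low limb */
  uint32_t nadj = n10 + (n1 ? d : 0);        /* may wrap, as in the asm */
  uint64_t t = (uint64_t) m * (n2 + n1)
             + (((uint64_t) n2 << 32) | nadj);
  uint32_t q1 = (uint32_t) (t >> 32);        /* q1 or q1 = q-1 */
  /* n - d - q1*d, the asm's psubq chain; same value as n - (q1+1)*d */
  uint64_t n = ((uint64_t) n2 << 32) | n10;
  uint64_t r64 = n - d - (uint64_t) q1 * d;
  uint32_t mask = (uint32_t) (r64 >> 32);    /* 0, or 0xFFFFFFFF if overshot */
  *rem = (uint32_t) r64 + (mask & d);        /* add d back if we overshot */
  return q1 + 1 + mask;                      /* q1+1 normally, q1 on overshoot */
}
```

As the comments note, writing the check as `n-d - q1*d` instead of `n - (q1+1)*d` keeps only one subtraction on the dependent chain and sidesteps any overflow test on `q1+1`.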
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a047a51
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -0,0 +1,213 @@
+/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2600 MHz P4 Northwood */
+/* FFT tuning limit = 23,700,309 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */
+
+#define MOD_1_NORM_THRESHOLD                 5
+#define MOD_1_UNNORM_THRESHOLD              14
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 4.36% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD             16
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define DIV_1_VS_MUL_1_PERCENT             358
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD               101
+#define MUL_TOOM44_THRESHOLD               284
+#define MUL_TOOM6H_THRESHOLD               406
+#define MUL_TOOM8H_THRESHOLD               592
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     101
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     191
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     189
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     195
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     151
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 51
+#define SQR_TOOM3_THRESHOLD                163
+#define SQR_TOOM4_THRESHOLD                254
+#define SQR_TOOM6_THRESHOLD                614
+#define SQR_TOOM8_THRESHOLD                842
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             824  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    824, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     36, 6}, {     19, 5}, {     39, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     48, 7}, {     29, 8}, \
+    {     15, 7}, {     37, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     99, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    143,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    383, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    527,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1119, 9}, {   2239,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471, 9}, {   2943,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,10}, {   1855,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1119,12}, {    575,11}, \
+    {   1215,10}, {   2431,11}, {   1247,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,10}, {   2943,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,10}, {   3455,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,11}, \
+    {   2687,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,11}, {   4991,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1535,12}, {   3135,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3967,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,12}, \
+    {   4991,14}, {   1279,13}, {   2687,12}, {   5503,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 167
+#define MUL_FFT_THRESHOLD                 7808
+
+#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    560, 5}, {     33, 6}, {     17, 5}, {     35, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {    111,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127, 9}, {    511, 8}, {   1023, 9}, \
+    {    527,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351,11}, {    191,10}, {    431,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607, 9}, {   1215,11}, {    319,10}, {    639,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1119,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    927,10}, {   1855,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1055,10}, {   2111,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1855,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,11}, {   3455,13}, \
+    {    895,12}, {   1983,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2495,11}, {   4991,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4991,14}, {   1279,13}, \
+    {   2687,12}, {   5503,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 149
+#define SQR_FFT_THRESHOLD                 4800
+
+#define MULLO_BASECASE_THRESHOLD            12
+#define MULLO_DC_THRESHOLD                  44
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD            13
+#define SQRLO_DC_THRESHOLD                  42
+#define SQRLO_SQR_THRESHOLD               9449
+
+#define DC_DIV_QR_THRESHOLD                 38
+#define DC_DIVAPPR_Q_THRESHOLD             105
+#define DC_BDIV_QR_THRESHOLD                52
+#define DC_BDIV_Q_THRESHOLD                 83
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               158
+#define INV_APPR_THRESHOLD                 118
+
+#define BINV_NEWTON_THRESHOLD              342
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               2130
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD               60
+#define MU_BDIV_QR_THRESHOLD              1652
+#define MU_BDIV_Q_THRESHOLD               2089
+
+#define POWM_SEC_TABLE  1,22,96,446,723,1378
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               298
+#define SET_STR_PRECOMPUTE_THRESHOLD       960
+
+#define FAC_DSC_THRESHOLD                  212
+#define FAC_ODD_THRESHOLD                   71
+
+#define MATRIX22_STRASSEN_THRESHOLD         26
+#define HGCD2_DIV1_METHOD                    3  /* 0.68% faster than 1 */
+#define HGCD_THRESHOLD                      80
+#define HGCD_APPR_THRESHOLD                138
+#define HGCD_REDUCE_THRESHOLD             4455
+#define GCD_DC_THRESHOLD                   365
+#define GCDEXT_DC_THRESHOLD                245
+#define JACOBI_BASE_METHOD                   4  /* 23.41% faster than 1 */
+
+/* Tuneup completed successfully, took 63807 seconds */
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ee88bab
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
@@ -0,0 +1,166 @@
+dnl  x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimize.  The present code was written quite straightforwardly.
+C  * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
+C  * Write a cps function that uses sse2 insns.
+
+C                           cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)     16
+C P4 model 3-4 (Prescott)      18
+
+C INPUT PARAMETERS
+C ap		sp + 4
+C n		sp + 8
+C b		sp + 12
+C cps		sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`ap',     `%edx')
+define(`n',      `%eax')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebx
+	mov	8(%esp), ap
+	mov	12(%esp), n
+	mov	20(%esp), %ecx
+	movd	8(%ecx), B1modb
+	movd	12(%ecx), B2modb
+
+	lea	-4(ap,n,4), ap
+
+C FIXME: See comment in generic/mod_1_1.c.
+	movd	(ap), %mm7
+	movd	-4(ap), %mm4
+	pmuludq B1modb, %mm7
+	paddq	%mm4, %mm7
+	add	$-2, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	movq	%mm7, %mm6
+	psrlq	$32, %mm7		C rh
+	movd	-8(ap), %mm0
+	add	$-4, ap
+	pmuludq	B2modb, %mm7
+	pmuludq	B1modb, %mm6
+	add	$-1, n
+	paddq	%mm0, %mm7
+	paddq	%mm6, %mm7
+	jnz	L(top)
+
+L(end):	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+	pand	%mm7, %mm4		C rl
+	psrlq	$32, %mm7		C rh
+	pmuludq	B1modb, %mm7		C rh,cl
+	paddq	%mm4, %mm7		C rh,rl
+	movd	4(%ecx), %mm4		C cnt
+	psllq	%mm4, %mm7		C rh,rl normalized
+	movq	%mm7, %mm2		C rl in low half
+	psrlq	$32, %mm7		C rh
+	movd	(%ecx), %mm1		C bi
+	pmuludq	%mm7, %mm1		C qh,ql
+	paddq	%mm2, %mm1		C qh-1,ql
+	movd	%mm1, %ecx		C ql
+	psrlq	$32, %mm1		C qh-1
+	movd	16(%esp), %mm3		C b
+	pmuludq	%mm1, %mm3		C (qh-1) * b
+	psubq	%mm3, %mm2		C r in low half (could use psubd)
+	movd	%mm2, %eax		C r
+	mov	16(%esp), %ebx
+	sub	%ebx, %eax		C r
+	cmp	%eax, %ecx
+	lea	(%eax,%ebx), %edx
+	cmovc(	%edx, %eax)
+	movd	%mm4, %ecx		C cnt
+	cmp	%ebx, %eax
+	jae	L(fix)
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+
+L(fix):	sub	%ebx, %eax
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+C CAUTION: This is the same code as in k7/mod_1_1.asm
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	xor	%ebx, %ebx
+	sub	%ebp, %ebx
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%edx, %ebx
+	mul	%ebx
+	add	%ebx, %edx
+	not	%edx
+	imul	%ebp, %edx
+	add	%edx, %ebp
+	cmp	%edx, %eax
+	cmovc(	%ebp, %edx)
+	shr	%cl, %ebx
+	mov	%ebx, 8(%esi)		C store B1modb
+	shr	%cl, %edx
+	mov	%edx, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
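mpn_mod_1_1p above folds the operand one limb per iteration using the precomputed residues B1modb = B mod b and B2modb = B^2 mod b (stored pre-shifted for the normalized divisor by the cps routine). A simplified Python model of the reduction, assuming 32-bit limbs and ignoring the normalization shift and the final multiply-by-inverse step:

```python
B = 1 << 32  # limb base

def mod_1_1(limbs, b):
    """Limbs least-significant first; needs at least two limbs."""
    # Precompute B mod b and B^2 mod b, as mpn_mod_1_1p_cps does
    # (the asm stores them pre-shifted for the normalized divisor).
    b1 = B % b           # B1modb
    b2 = (B * B) % b     # B2modb
    # Seed with the two top limbs: r ≡ a[n-1]*B + a[n-2]  (mod b)
    r = limbs[-1] * b1 + limbs[-2]
    # Fold in one limb per step while r stays within two limbs:
    # r*B + a = rh*B^2 + rl*B + a ≡ rh*B2modb + rl*B1modb + a  (mod b)
    for a in reversed(limbs[:-2]):
        rh, rl = r >> 32, r & (B - 1)
        r = rh * b2 + rl * b1 + a
    # The asm finishes with a multiply-by-inverse reduction; plain % here.
    return r % b
```

The mod_1s_4p variant in the next file applies the same idea four limbs per iteration, using B1modb through B5modb.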
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
new file mode 100644
index 0000000..eb2edb6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
@@ -0,0 +1,269 @@
+dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimize.  The present code was written quite straightforwardly.
+C  * Optimize post-loop reduction code.
+C  * Write a cps function that uses sse2 insns.
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		3.4
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4
+C P4 model 3-4 (Prescott)	4.5
+
+C INPUT PARAMETERS
+C ap		sp + 4
+C n		sp + 8
+C b		sp + 12
+C cps		sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`B3modb', `%mm3')
+define(`B4modb', `%mm4')
+define(`B5modb', `%mm5')
+define(`ap',     `%edx')
+define(`n',      `%eax')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebx
+	mov	8(%esp), ap
+	mov	12(%esp), n
+	mov	20(%esp), %ecx
+
+	movd	8(%ecx), B1modb
+	movd	12(%ecx), B2modb
+	movd	16(%ecx), B3modb
+	movd	20(%ecx), B4modb
+	movd	24(%ecx), B5modb
+
+	mov	n, %ebx
+	lea	-4(ap,n,4), ap
+	and	$3, %ebx
+	je	L(b0)
+	cmp	$2, %ebx
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	movd	-4(ap), %mm7
+	pmuludq	B1modb, %mm7
+	movd	-8(ap), %mm6
+	paddq	%mm6, %mm7
+	movd	(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm7
+	lea	-24(ap), ap
+	add	$-3, n
+	jz	L(end)
+	jmp	L(top)
+
+L(b0):	movd	-8(ap), %mm7
+	pmuludq	B1modb, %mm7
+	movd	-12(ap), %mm6
+	paddq	%mm6, %mm7
+	movd	-4(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm7
+	movd	(ap), %mm6
+	pmuludq	B3modb, %mm6
+	paddq	%mm6, %mm7
+	lea	-28(ap), ap
+	add	$-4, n
+	jz	L(end)
+	jmp	L(top)
+
+L(b1):	movd	(ap), %mm7
+	lea	-16(ap), ap
+	dec	n
+	jz	L(x)
+	jmp	L(top)
+
+L(b2):	movd	-4(ap), %mm7		C rl
+	punpckldq (ap), %mm7		C rh
+	lea	-20(ap), ap
+	add	$-2, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	movd	4(ap), %mm0
+	pmuludq	B1modb, %mm0
+	movd	0(ap), %mm6
+	paddq	%mm6, %mm0
+
+	movd	8(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm0
+
+	movd	12(ap), %mm6
+	pmuludq	B3modb, %mm6
+	paddq	%mm6, %mm0
+
+	movq	%mm7, %mm6
+	psrlq	$32, %mm7		C rh
+	pmuludq	B5modb, %mm7
+	pmuludq	B4modb, %mm6
+
+	paddq	%mm0, %mm7
+	paddq	%mm6, %mm7
+
+	add	$-16, ap
+	add	$-4, n
+	jnz	L(top)
+
+L(end):	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+	pand	%mm7, %mm4		C rl
+	psrlq	$32, %mm7		C rh
+	pmuludq	B1modb, %mm7		C rh,cl
+	paddq	%mm4, %mm7		C rh,rl
+L(x):	movd	4(%ecx), %mm4		C cnt
+	psllq	%mm4, %mm7		C rh,rl normalized
+	movq	%mm7, %mm2		C rl in low half
+	psrlq	$32, %mm7		C rh
+	movd	(%ecx), %mm1		C bi
+	pmuludq	%mm7, %mm1		C qh,ql
+	paddq	%mm2, %mm1		C qh-1,ql
+	movd	%mm1, %ecx		C ql
+	psrlq	$32, %mm1		C qh-1
+	movd	16(%esp), %mm3		C b
+	pmuludq	%mm1, %mm3		C (qh-1) * b
+	psubq	%mm3, %mm2		C r in low half (could use psubd)
+	movd	%mm2, %eax		C r
+	mov	16(%esp), %ebx
+	sub	%ebx, %eax		C r
+	cmp	%eax, %ecx
+	lea	(%eax,%ebx), %edx
+	cmovc(	%edx, %eax)
+	movd	%mm4, %ecx		C cnt
+	cmp	%ebx, %eax
+	jae	L(fix)
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+
+L(fix):	sub	%ebx, %eax
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in k7/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
new file mode 100644
index 0000000..31e25b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -0,0 +1,175 @@
+dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Pentium4: 1.0 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Enhancements:
+C
+C There might be a couple of cycles to save by using plain integer code for
+C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
+C about 46 (inclusive of some function call overheads).
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+define(SAVE_ESI, `PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+	movl	(%edx), %eax
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+	jne	L(one)
+
+	movl	4(%edx), %edx
+	movl	%eax, %ecx
+	shrl	$24, %eax		C src[0] high
+
+	andl	$0x00FFFFFF, %ecx	C src[0] low
+	addl	%ecx, %eax
+
+	movl	%edx, %ecx
+	shll	$8, %edx
+
+	shrl	$16, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+	andl	$0x00FFFF00, %edx	C src[1] high
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	pxor	%mm0, %mm0
+	pxor	%mm1, %mm1
+	pxor	%mm2, %mm2
+
+	pcmpeqd	%mm7, %mm7
+	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits
+
+	pcmpeqd	%mm6, %mm6
+	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits
+
+L(top):
+	C eax
+	C ebx
+	C ecx	counter, size-2 to 0, -1 or -2
+	C edx	src, incrementing
+	C
+	C mm0	sum 0mod3
+	C mm1	sum 1mod3
+	C mm2	sum 2mod3
+	C mm3
+	C mm4
+	C mm5
+	C mm6	0x0000000000FFFFFF
+	C mm7	0x00000000FFFFFFFF
+
+	movd	(%edx), %mm3
+	paddq	%mm3, %mm0
+
+	movd	4(%edx), %mm3
+	paddq	%mm3, %mm1
+
+	movd	8(%edx), %mm3
+	paddq	%mm3, %mm2
+
+	addl	$12, %edx
+	subl	$3, %ecx
+	ja	L(top)
+
+
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	addl	$1, %ecx
+	js	L(combine)		C 0 more
+
+	movd	(%edx), %mm3
+	paddq	%mm3, %mm0
+
+	jz	L(combine)		C 1 more
+
+	movd	4(%edx), %mm3
+	paddq	%mm3, %mm1
+
+L(combine):
+	movq	%mm7, %mm3		C low halves
+	pand	%mm0, %mm3
+
+	movq	%mm7, %mm4
+	pand	%mm1, %mm4
+
+	movq	%mm7, %mm5
+	pand	%mm2, %mm5
+
+	psrlq	$32, %mm0		C high halves
+	psrlq	$32, %mm1
+	psrlq	$32, %mm2
+
+	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
+	paddq	%mm1, %mm5
+	paddq	%mm2, %mm3
+
+	psllq	$8, %mm4		C combine at respective offsets
+	psllq	$16, %mm5
+	paddq	%mm4, %mm3
+	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits
+
+	pand	%mm3, %mm6		C fold at 24 bits
+	psrlq	$24, %mm3
+
+	paddq	%mm6, %mm3
+	movd	%mm3, %eax
+
+	ASSERT(z,	C nothing left in high dword
+	`psrlq	$32, %mm3
+	movd	%mm3, %ecx
+	orl	%ecx, %ecx')
+
+	emms
+	ret
+
+EPILOGUE()
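mpn_mod_34lsub1 exploits 2^24 ≡ 1 (mod 2^24-1): limb i carries weight 2^(32*i) ≡ 2^((8*i) mod 24), so limbs are summed into three accumulators by index mod 3 and combined at bit offsets 0, 8 and 16, exactly as the three-way loop and the combine block above do. A Python model, assuming 32-bit limbs:

```python
M = (1 << 24) - 1  # modulus 2^24 - 1

def mod_34lsub1(limbs):
    # Limb i has weight 2^(32*i) ≡ 1, 2^8, 2^16 (mod M) for i ≡ 0,1,2 (mod 3).
    s = [0, 0, 0]
    for i, x in enumerate(limbs):
        s[i % 3] += x
    lo = [v & 0xFFFFFFFF for v in s]
    hi = [v >> 32 for v in s]   # each carries an extra 2^32 ≡ 2^8 factor
    t = (lo[0] + hi[2]) + ((lo[1] + hi[0]) << 8) + ((lo[2] + hi[1]) << 16)
    # One fold at 24 bits, like the asm: the result is only *congruent*
    # to the input mod M; callers complete the reduction.
    return (t & M) + (t >> 24)
```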
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm
new file mode 100644
index 0000000..aa9ef31
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm
@@ -0,0 +1,175 @@
+dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movd	PARAM_CARRY, %mm1
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	pxor	%mm1, %mm1		C carry limb
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+
+	movd	PARAM_DIVISOR, %mm7
+
+	shrl	%eax
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm6		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm6, %mm6		C inv*inv
+
+	C
+
+	pmuludq	%mm7, %mm6		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm6, %mm6
+
+	paddd	%mm0, %mm6
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	C
+
+	pmuludq	%mm7, %mm0		C inv*inv*d
+	paddd	%mm6, %mm6		C 2*inv
+
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	C
+
+	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movd	%mm6, %eax
+	imul	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	pxor	%mm0, %mm0		C carry bit
+
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	counter, limbs
+	C edx
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm6	inverse
+	C mm7	divisor
+
+	movd	(%eax), %mm2
+	addl	$4, %eax
+
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm6, %mm2		C s*inverse
+
+	movq	%mm7, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+	subl	$1, %ecx
+	jnz	L(top)
+
+
+L(done):
+	paddq	%mm1, %mm0
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
+ASM_END()
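The inverse setup in mpn_modexact_1_odd above is Newton's iteration for a 2-adic inverse: starting from an 8-bit table value, each step inv = 2*inv - inv*inv*d doubles the number of correct low bits. A Python sketch, seeded with inv = d (already correct mod 8 for odd d) instead of binvert_limb_table, so it needs more iterations than the asm:

```python
def binvert_limb(d):
    """2-adic inverse of odd 32-bit d: returns inv with inv*d ≡ 1 (mod 2^32)."""
    assert d & 1, "d must be odd"
    inv = d                      # d*d ≡ 1 (mod 8), so 3 correct low bits
    for _ in range(4):           # 3 -> 6 -> 12 -> 24 -> 48 >= 32 bits
        inv = (2 * inv - inv * inv * d) & 0xFFFFFFFF
    return inv
```

The asm needs only two such steps because its table lookup already supplies 8 correct bits.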
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm
new file mode 100644
index 0000000..6347b8b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm
@@ -0,0 +1,164 @@
+dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C                           cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		4.17
+C P6 model 13  (Dothan)		4.17
+C P4 model 0-1 (Willamette)	4
+C P4 model 2   (Northwood)	4
+C P4 model 3-4 (Prescott)	4.55
+
+C TODO:
+C  * Tweak eax/edx offsets in loop so as to save some lea's
+C  * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C n		sp + 12
+C v0		sp + 16
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	pxor	%mm6, %mm6
+L(ent):	mov	4(%esp), %edx
+	mov	8(%esp), %eax
+	mov	12(%esp), %ecx
+	movd	16(%esp), %mm7
+	cmp	$4, %ecx
+	jnc	L(big)
+
+L(lp0):	movd	(%eax), %mm0
+	lea	4(%eax), %eax
+	lea	4(%edx), %edx
+	pmuludq	%mm7, %mm0
+	paddq	%mm0, %mm6
+	movd	%mm6, -4(%edx)
+	psrlq	$32, %mm6
+	dec	%ecx
+	jnz	L(lp0)
+	movd	%mm6, %eax
+	emms
+	ret
+
+L(big):	and	$3, %ecx
+	je	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	je	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+L(0):	movd	(%eax), %mm3
+	sub	12(%esp), %ecx		C loop count
+	lea	-16(%eax), %eax
+	lea	-12(%edx), %edx
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	pmuludq	%mm7, %mm0
+	movd	24(%eax), %mm1
+	jmp	L(00)
+
+L(1):	movd	(%eax), %mm2
+	sub	12(%esp), %ecx
+	lea	-12(%eax), %eax
+	lea	-8(%edx), %edx
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	jmp	L(01)
+
+L(2):	movd	(%eax), %mm1
+	sub	12(%esp), %ecx
+	lea	-8(%eax), %eax
+	lea	-4(%edx), %edx
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	jmp	L(10)
+
+L(3):	movd	(%eax), %mm0
+	sub	12(%esp), %ecx
+	lea	-4(%eax), %eax
+	pmuludq	%mm7, %mm0
+	movd	8(%eax), %mm1
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+
+	ALIGN(16)
+L(top):	pmuludq	%mm7, %mm2
+	paddq	%mm0, %mm6
+	movd	16(%eax), %mm3
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+L(10):	pmuludq	%mm7, %mm3
+	paddq	%mm1, %mm6
+	movd	20(%eax), %mm0
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+L(01):	pmuludq	%mm7, %mm0
+	paddq	%mm2, %mm6
+	movd	24(%eax), %mm1
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+L(00):	pmuludq	%mm7, %mm1
+	paddq	%mm3, %mm6
+	movd	28(%eax), %mm2
+	movd	%mm6, 12(%edx)
+	psrlq	$32, %mm6
+	lea	16(%eax), %eax
+	lea	16(%edx), %edx
+	add	$4, %ecx
+	ja	L(top)
+
+L(end):	pmuludq	%mm7, %mm2
+	paddq	%mm0, %mm6
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm1, %mm6
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm2, %mm6
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	ret
+EPILOGUE()
+PROLOGUE(mpn_mul_1c)
+	movd	20(%esp), %mm6
+	jmp	L(ent)
+EPILOGUE()
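mpn_mul_1 above computes rp[] = up[] * v0 and returns the final carry limb; the loop keeps the running 64-bit sum in %mm6, stores its low half, and shifts the high half down as the carry into the next limb, while mpn_mul_1c enters the same loop with an initial carry. In Python, assuming 32-bit limbs:

```python
B = 1 << 32  # limb base

def mpn_mul_1(up, v0, carry=0):
    # carry != 0 models the mpn_mul_1c entry point.
    rp = []
    for u in up:
        t = u * v0 + carry   # what pmuludq/paddq accumulate in %mm6
        rp.append(t & (B - 1))
        carry = t >> 32      # psrlq $32 between limbs
    return rp, carry         # carry is the asm's %eax return value
```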
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644
index 0000000..6e3775a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -0,0 +1,662 @@
+dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In code for un <= 3, try keeping accumulation operands in registers,
+C    without storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form is
+C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
+C  * Look into different loop alignment, we now expand the code about 50 bytes
+C    with possibly needless alignment.
+C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C  * Use OSP, should solve feed-in latency problems.
+C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C    so that they can share feed-in code, and changing the branch targets from
+C    L<n> to Lm<nn>.
+
+C                           cycles/limb
+C P6 model 9   (Banias)         ?
+C P6 model 13  (Dothan)         5.24
+C P6 model 14  (Yonah)          ?
+C P4 model 0-1 (Willamette):    5
+C P4 model 2   (Northwood):     4.60 at 32 limbs
+C P4 model 3-4 (Prescott):      4.94 at 32 limbs
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C un		sp + 12
+C vp		sp + 16
+C vn		sp + 20
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	push	%esi
+	push	%ebx
+	mov	12(%esp), %edx		C rp
+	mov	16(%esp), %eax		C up
+	mov	20(%esp), %ecx		C un
+	mov	24(%esp), %esi		C vp
+	mov	28(%esp), %ebx		C vn
+	movd	(%esi), %mm7		C
+L(ent):	cmp	$3, %ecx
+	ja	L(big)
+	movd	(%eax), %mm6
+	pmuludq	%mm7, %mm6
+	jz	L(un3)
+	cmp	$2, %ecx
+	jz	L(un2)
+
+L(un1):	movd	%mm6, (%edx)		C				un=1
+	psrlq	$32, %mm6		C				un=1
+	movd	%mm6, 4(%edx)		C				un=1
+	jmp	L(rtr)			C				un=1
+
+L(un2):	movd	4(%eax), %mm1		C				un=2
+	pmuludq	%mm7, %mm1		C				un=2
+	movd	%mm6, (%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	paddq	%mm1, %mm6		C				un=2
+	movd	%mm6, 4(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	movd	%mm6, 8(%edx)		C				un=2
+      dec	%ebx			C				un=2
+      jz	L(rtr)			C				un=2
+	movd	4(%esi), %mm7		C				un=2
+	movd	(%eax), %mm6		C				un=2
+	pmuludq	%mm7, %mm6		C				un=2
+	movd	4(%eax), %mm1		C				un=2
+	movd	4(%edx), %mm4		C				un=2
+	pmuludq	%mm7, %mm1		C				un=2
+	movd	8(%edx), %mm5		C				un=2
+	paddq	%mm4, %mm6		C				un=2
+	paddq	%mm1, %mm5		C				un=2
+	movd	%mm6, 4(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	paddq	%mm5, %mm6		C				un=2
+	movd	%mm6, 8(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	movd	%mm6, 12(%edx)		C				un=2
+L(rtr):	emms
+	pop	%ebx
+	pop	%esi
+	ret
+
+L(un3):	movd	4(%eax), %mm1		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	movd	%mm6, (%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm1, %mm6		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm2, %mm6		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+      dec	%ebx			C				un=3
+      jz	L(rtr)			C				un=3
+	movd	4(%esi), %mm7		C				un=3
+	movd	(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	4(%eax), %mm1		C				un=3
+	movd	4(%edx), %mm4		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	movd	8(%edx), %mm5		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	paddq	%mm1, %mm5		C				un=3
+	movd	12(%edx), %mm4		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm5, %mm6		C				un=3
+	paddq	%mm2, %mm4		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 16(%edx)		C				un=3
+      dec	%ebx			C				un=3
+      jz	L(rtr)			C				un=3
+	movd	8(%esi), %mm7		C				un=3
+	movd	(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	4(%eax), %mm1		C				un=3
+	movd	8(%edx), %mm4		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	movd	12(%edx), %mm5		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	paddq	%mm1, %mm5		C				un=3
+	movd	16(%edx), %mm4		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm5, %mm6		C				un=3
+	paddq	%mm2, %mm4		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	movd	%mm6, 16(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 20(%edx)		C				un=3
+	jmp	L(rtr)
+
+
+L(big):	push	%edi
+	pxor	%mm6, %mm6
+	lea	4(%esi), %esi
+	and	$3, %ecx
+	jz	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	jz	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+
+L(0):	movd	(%eax), %mm3		C				m 0
+	sub	24(%esp), %ecx		C inner loop count		m 0
+	mov	%ecx, 24(%esp)		C update loop count for later	m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	jmp	L(m00)			C				m 0
+	ALIGN(16)			C				m 0
+L(lpm0):
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	(%eax), %mm3		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	movd	%mm6, -8(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	paddq	%mm4, %mm6		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	movd	%mm6, -4(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+L(m00):	pmuludq	%mm7, %mm1		C				m 0
+	paddq	%mm3, %mm6		C				m 0
+	movd	12(%eax), %mm4		C				m 0
+	movd	%mm6, (%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	lea	16(%eax), %eax		C				m 0
+	lea	16(%edx), %edx		C				m 0
+	add	$4, %ecx		C				m 0
+	ja	L(lpm0)			C				m 0
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	mov	16(%esp), %edi		C rp				  0
+	jmp	L(x0)
+
+L(olp0):
+	lea	4(%edi), %edi		C				am 0
+	movd	(%esi), %mm7		C				am 0
+	lea	4(%esi), %esi		C				am 0
+	mov	%edi, %edx		C rp				am 0
+	mov	20(%esp), %eax		C up				am 0
+	movd	(%eax), %mm3		C				am 0
+	mov	24(%esp), %ecx		C inner loop count		am 0
+	pxor	%mm6, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	jmp	L(am00)			C				am 0
+	ALIGN(16)			C				mm 0
+L(lam0):
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	(%eax), %mm3		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+L(am00):
+	pmuludq	%mm7, %mm1		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	12(%eax), %mm2		C				am 0
+	paddq	%mm0, %mm4		C				am 0
+	movd	8(%edx), %mm5		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	lea	16(%eax), %eax		C				am 0
+	lea	16(%edx), %edx		C				am 0
+	add	$4, %ecx		C				am 0
+	jnz	L(lam0)			C				am 0
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+L(x0):	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	dec	%ebx			C				am 0
+	jnz	L(olp0)			C				am 0
+L(oel0):
+	emms				C				   0
+	pop	%edi			C				   0
+	pop	%ebx			C				   0
+	pop	%esi			C				   0
+	ret				C				   0
+
+
+L(1):	movd	(%eax), %mm4		C				m 1
+	sub	24(%esp), %ecx		C				m 1
+	mov	%ecx, 24(%esp)		C update loop count for later	m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	jmp	L(m01)			C				m 1
+	ALIGN(16)			C				m 1
+L(lpm1):
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	movd	%mm6, -4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+L(m01):	pmuludq	%mm7, %mm0		C				m 1
+	paddq	%mm4, %mm6		C				m 1
+	movd	12(%eax), %mm1		C				m 1
+	movd	%mm6, (%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm1		C				m 1
+	paddq	%mm3, %mm6		C				m 1
+	movd	16(%eax), %mm4		C				m 1
+	movd	%mm6, 4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	lea	16(%eax), %eax		C				m 1
+	lea	16(%edx), %edx		C				m 1
+	add	$4, %ecx		C				m 1
+	ja	L(lpm1)			C				m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	mov	16(%esp), %edi		C rp				  1
+	jmp	L(x1)
+
+L(olp1):
+	lea	4(%edi), %edi		C				am 1
+	movd	(%esi), %mm7		C				am 1
+	lea	4(%esi), %esi		C				am 1
+	mov	%edi, %edx		C rp				am 1
+	mov	20(%esp), %eax		C up				am 1
+	movd	(%eax), %mm2		C				am 1
+	mov	24(%esp), %ecx		C inner loop count		am 1
+	pxor	%mm6, %mm6		C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	jmp	L(am01)			C				am 1
+	ALIGN(16)			C				am 1
+L(lam1):
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+L(am01):
+	pmuludq	%mm7, %mm0		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	12(%eax), %mm1		C				am 1
+	paddq	%mm3, %mm5		C				am 1
+	movd	8(%edx), %mm4		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm1		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	16(%eax), %mm2		C				am 1
+	paddq	%mm0, %mm4		C				am 1
+	movd	12(%edx), %mm5		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	lea	16(%eax), %eax		C				am 1
+	lea	16(%edx), %edx		C				am 1
+	add	$4, %ecx		C				am 1
+	jnz	L(lam1)			C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+L(x1):	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	dec	%ebx			C				am 1
+	jnz	L(olp1)			C				am 1
+L(oel1):
+	emms				C				   1
+	pop	%edi			C				   1
+	pop	%ebx			C				   1
+	pop	%esi			C				   1
+	ret				C				   1
+
+
+L(2):	movd	(%eax), %mm1		C				m 2
+	sub	24(%esp), %ecx		C				m 2
+	mov	%ecx, 24(%esp)		C update loop count for later	m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	movd	4(%eax), %mm4		C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	jmp	L(m10)			C				m 2
+	ALIGN(16)			C				m 2
+L(lpm2):
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+L(m10):	pmuludq	%mm7, %mm3		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	movd	12(%eax), %mm0		C				m 2
+	movd	%mm6, (%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm0		C				m 2
+	paddq	%mm4, %mm6		C				m 2
+	movd	16(%eax), %mm1		C				m 2
+	movd	%mm6, 4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	paddq	%mm3, %mm6		C				m 2
+	movd	20(%eax), %mm4		C				m 2
+	movd	%mm6, 8(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	lea	16(%eax), %eax		C				m 2
+	lea	16(%edx), %edx		C				m 2
+	add	$4, %ecx		C				m 2
+	ja	L(lpm2)			C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	mov	16(%esp), %edi		C rp				  2
+	jmp	L(x2)
+
+L(olp2):
+	lea	4(%edi), %edi		C				am 2
+	movd	(%esi), %mm7		C				am 2
+	lea	4(%esi), %esi		C				am 2
+	mov	%edi, %edx		C rp				am 2
+	mov	20(%esp), %eax		C up				am 2
+	movd	(%eax), %mm1		C				am 2
+	mov	24(%esp), %ecx		C inner loop count		am 2
+	pxor	%mm6, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	movd	4(%eax), %mm2		C				am 2
+	movd	(%edx), %mm5		C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	jmp	L(am10)			C				am 2
+	ALIGN(16)			C				am 2
+L(lam2):
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+L(am10):
+	pmuludq	%mm7, %mm3		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	12(%eax), %mm0		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+	movd	8(%edx), %mm5		C				am 2
+	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm0		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	16(%eax), %mm1		C				am 2
+	paddq	%mm3, %mm5		C				am 2
+	movd	12(%edx), %mm4		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	20(%eax), %mm2		C				am 2
+	paddq	%mm0, %mm4		C				am 2
+	movd	16(%edx), %mm5		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	lea	16(%eax), %eax		C				am 2
+	lea	16(%edx), %edx		C				am 2
+	add	$4, %ecx		C				am 2
+	jnz	L(lam2)			C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+L(x2):	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	dec	%ebx			C				am 2
+	jnz	L(olp2)			C				am 2
+L(oel2):
+	emms				C				   2
+	pop	%edi			C				   2
+	pop	%ebx			C				   2
+	pop	%esi			C				   2
+	ret				C				   2
+
+
+L(3):	movd	(%eax), %mm0		C				m 3
+	sub	24(%esp), %ecx		C				m 3
+	mov	%ecx, 24(%esp)		C update loop count for later	m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	movd	4(%eax), %mm1		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	movd	8(%eax), %mm4		C				m 3
+	jmp	L(lpm3)			C				m 3
+	ALIGN(16)			C				m 3
+L(lpm3):
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	12(%eax), %mm3		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm3		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	movd	16(%eax), %mm0		C				m 3
+	movd	%mm6, 4(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	paddq	%mm4, %mm6		C				m 3
+	movd	20(%eax), %mm1		C				m 3
+	movd	%mm6, 8(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	paddq	%mm3, %mm6		C				m 3
+	movd	24(%eax), %mm4		C				m 3
+	movd	%mm6, 12(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	lea	16(%eax), %eax		C				m 3
+	lea	16(%edx), %edx		C				m 3
+	add	$4, %ecx		C				m 3
+	ja	L(lpm3)			C				m 3
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	mov	16(%esp), %edi		C rp				  3
+	jmp	L(x3)
+
+L(olp3):
+	lea	4(%edi), %edi		C				am 3
+	movd	(%esi), %mm7		C				am 3
+	lea	4(%esi), %esi		C				am 3
+	mov	%edi, %edx		C rp				am 3
+	mov	20(%esp), %eax		C up				am 3
+	movd	(%eax), %mm0		C				am 3
+	mov	24(%esp), %ecx		C inner loop count		am 3
+	pxor	%mm6, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	movd	4(%eax), %mm1		C				am 3
+	movd	(%edx), %mm4		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	movd	8(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	4(%edx), %mm5		C				am 3
+	jmp	L(lam3)			C				am 3
+	ALIGN(16)			C				am 3
+L(lam3):
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	12(%eax), %mm3		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm3		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	16(%eax), %mm0		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+	movd	12(%edx), %mm5		C				am 3
+	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	20(%eax), %mm1		C				am 3
+	paddq	%mm3, %mm5		C				am 3
+	movd	16(%edx), %mm4		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	24(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	20(%edx), %mm5		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	lea	16(%eax), %eax		C				am 3
+	lea	16(%edx), %edx		C				am 3
+	add	$4, %ecx		C				am 3
+	jnz	L(lam3)			C				am 3
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+L(x3):	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	dec	%ebx			C				am 3
+	jnz	L(olp3)			C				am 3
+L(oel3):
+	emms				C				   3
+	pop	%edi			C				   3
+	pop	%ebx			C				   3
+	pop	%esi			C				   3
+	ret				C				   3
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm b/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl  X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit		     popcount	     hamdist
+C			    cycles/limb	    cycles/limb
+C P5				-
+C P6 model 0-8,10-12		-
+C P6 model 9  (Banias)		?
+C P6 model 13 (Dothan)		4
+C P4 model 0  (Willamette)	?
+C P4 model 1  (?)		?
+C P4 model 2  (Northwood)	3.9
+C P4 model 3  (Prescott)	?
+C P4 model 4  (Nocona)		?
+C AMD K6			-
+C AMD K7			-
+C AMD K8			?
+
+C 64-bit		     popcount	     hamdist
+C			    cycles/limb	    cycles/limb
+C P4 model 4 (Nocona):		8
+C AMD K8,K9			7.5
+C AMD K10			3.5
+C Intel core2			3.68
+C Intel corei			3.15
+C Intel atom		       10.8
+C VIA nano			6.5
+
+C TODO
+C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
+C    using movdqu for one operand and movdqa for the other, or by painfully
+C    shifting as we go.  Unfortunately, there seems to be no usable shift
+C    instruction, except for one that takes an immediate count.
+C  * It would probably be possible to cut a few cycles/limb using software
+C    pipelining.
+C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
+C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
+C    integer based popcount.  Such a combined loop would handle 6 limbs in
+C    about 30 cycles on K8.
+C  * We could save a byte or two by using 32-bit operations on areg.
+C  * Check if using movdqa to a temp and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+`	define(`up',  `%edx')
+	define(`n',   `%ecx')
+	define(`areg',`%eax')
+	define(`breg',`%ebx')
+	define(`zero',`%xmm4')
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`dnl')
+',`
+	define(`up',  `%rdi')
+	define(`n',   `%rsi')
+	define(`areg',`%rax')
+	define(`breg',`%rdx')
+	define(`zero',`%xmm8')
+	define(`LIMB32',`dnl')
+	define(`LIMB64',`	$1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw')			C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds.  Only the former has
+C LEAL.  Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+	define(`cnsts', MPN(popccnsts))
+	GLOBL	cnsts')
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`mov	4(%esp), up	')
+LIMB32(`mov	8(%esp), n	')
+LIMB32(`push	%ebx		')
+
+	pxor	%xmm3, %xmm3		C zero grand total count
+LIMB64(`pxor	zero, zero	')
+
+	LEAL(	cnsts, breg)
+
+	movdqa	-48(breg), mm01010101
+	movdqa	-32(breg), mm00110011
+	movdqa	-16(breg), mm00001111
+
+	mov	up, areg
+	and	$-16, up		C round `up' down to 128-bit boundary
+	and	$12, areg		C 32:areg = 0, 4, 8, 12
+					C 64:areg = 0, 8
+	movdqa	(up), %xmm0
+	pand	64(breg,areg,4), %xmm0
+	shr	$m4_log2(GMP_LIMB_BYTES), %eax
+	add	areg, n			C compensate n for rounded down `up'
+
+	pxor	%xmm4, %xmm4
+	sub	$LIMBS_PER_XMM, n
+	jbe	L(sum)
+
+	sub	$LIMBS_PER_XMM, n
+	ja	L(ent)
+	jmp	L(lsum)
+
+	ALIGN(16)
+L(top):	movdqa	(up), %xmm0
+L(ent):	movdqa	16(up), %xmm4
+
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm4, %xmm5
+	psrld	$1, %xmm0
+	psrld	$1, %xmm4
+	pand	mm01010101, %xmm0
+	pand	mm01010101, %xmm4
+	psubd	%xmm0, %xmm1
+	psubd	%xmm4, %xmm5
+
+	movdqa	%xmm1, %xmm0
+	movdqa	%xmm5, %xmm4
+	psrlq	$2, %xmm1
+	psrlq	$2, %xmm5
+	pand	mm00110011, %xmm0
+	pand	mm00110011, %xmm4
+	pand	mm00110011, %xmm1
+	pand	mm00110011, %xmm5
+	paddq	%xmm0, %xmm1
+	paddq	%xmm4, %xmm5
+
+LIMB32(`pxor	zero, zero	')
+
+	add	$32, up
+	sub	$LIMBS_PER_2XMM, n
+
+	paddq	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm0
+	psrlq	$4, %xmm1
+	pand	mm00001111, %xmm0
+	pand	mm00001111, %xmm1
+	paddq	%xmm0, %xmm1
+
+	psadbw	zero, %xmm1
+	paddq	%xmm1, %xmm3		C add to grand total
+
+	jnc	L(top)
+L(end):
+	add	$LIMBS_PER_2XMM, n
+	jz	L(rt)
+	movdqa	(up), %xmm0
+	pxor	%xmm4, %xmm4
+	sub	$LIMBS_PER_XMM, n
+	jbe	L(sum)
+L(lsum):
+	movdqa	%xmm0, %xmm4
+	movdqa	16(up), %xmm0
+L(sum):
+	shl	$m4_log2(GMP_LIMB_BYTES), n
+	and	$12, n
+	pand	(breg,n,4), %xmm0
+
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm4, %xmm5
+	psrld	$1, %xmm0
+	psrld	$1, %xmm4
+	pand	mm01010101, %xmm0
+	pand	mm01010101, %xmm4
+	psubd	%xmm0, %xmm1
+	psubd	%xmm4, %xmm5
+
+	movdqa	%xmm1, %xmm0
+	movdqa	%xmm5, %xmm4
+	psrlq	$2, %xmm1
+	psrlq	$2, %xmm5
+	pand	mm00110011, %xmm0
+	pand	mm00110011, %xmm4
+	pand	mm00110011, %xmm1
+	pand	mm00110011, %xmm5
+	paddq	%xmm0, %xmm1
+	paddq	%xmm4, %xmm5
+
+LIMB32(`pxor	zero, zero	')
+
+	paddq	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm0
+	psrlq	$4, %xmm1
+	pand	mm00001111, %xmm0
+	pand	mm00001111, %xmm1
+	paddq	%xmm0, %xmm1
+
+	psadbw	zero, %xmm1
+	paddq	%xmm1, %xmm3		C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt):	movdqa	%xmm3, %xmm0
+	psrldq	$8, %xmm3
+	paddq	%xmm3, %xmm0
+	movd	%xmm0, areg		C movq avoided due to gas bug
+
+LIMB32(`pop	%ebx		')
+	ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
new file mode 100644
index 0000000..f421d13
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -0,0 +1,126 @@
+dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
+
+dnl  Copyright 2001-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C        cycles/limb (approx)
+C      dst!=src1,2  dst==src1  dst==src2
+C P4:      4.5         6.5        6.5
+
+
+C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+C                          mp_size_t size);
+C
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best.  Not sure why, but for instance leal
+C incrementing on %esi is a 1 or 2 cycle slowdown.
+C
+C The dependent chain is paddq combining the carry and next (shifted) part,
+C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
+C in total, makes 4 c/l the target speed, which is almost achieved for
+C separate src/dst but when src==dst the write combining anomalies slow it
+C down.
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_YP,   12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_XP')
+define(SAVE_ESI,`PARAM_YP')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_rsh1add_n)
+deflit(`FRAME',0)
+
+	movl	PARAM_XP, %edx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_YP, %ebx
+	movl	%esi, SAVE_ESI
+
+	movl	PARAM_WP, %esi
+
+	movd	(%edx), %mm0		C xp[0]
+
+	movd	(%ebx), %mm1		C yp[0]
+	movl	PARAM_SIZE, %ecx
+
+	movl	(%edx), %eax		C xp[0]
+
+	addl	(%ebx), %eax		C xp[0]+yp[0]
+
+	paddq	%mm1, %mm0		C xp[0]+yp[0]
+	leal	(%esi,%ecx,4), %esi	C wp end
+	negl	%ecx			C -size
+
+	psrlq	$1, %mm0		C (xp[0]+yp[0])/2
+	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0]
+	addl	$1, %ecx		C -(size-1)
+	jz	L(done)
+
+
+L(top):
+	C eax	return value
+	C ebx	yp end
+	C ecx	counter, limbs, -(size-1) to -1 inclusive
+	C edx	xp end
+	C esi	wp end
+	C mm0	carry (32 bits)
+
+	movd	4(%edx), %mm1	C xp[i+1]
+	movd	4(%ebx), %mm2	C yp[i+1]
+	leal	4(%edx), %edx
+	leal	4(%ebx), %ebx
+	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1]
+	psllq	$31, %mm1		C low bit at 31, further 32 above
+
+	paddq	%mm1, %mm0		C 31 and carry from prev add
+	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i]
+
+	psrlq	$32, %mm0		C high becomes new carry
+
+	addl	$1, %ecx
+	jnz	L(top)
+
+
+L(done):
+	movd	%mm0, -4(%esi)		C dst[size-1]
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_ESI, %esi
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..2dd57d2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -0,0 +1,705 @@
+dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
+C    storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form is
+C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
+C  * Look into different loop alignment, we now expand the code about 50 bytes
+C    with possibly needless alignment.
+C  * Use OSP, should solve feed-in latency problems.
+C  * Address relative slowness for un<=3 for Pentium M.  The old code is
+C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C un		sp + 12
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	mov	4(%esp), %edx		C rp
+	mov	8(%esp), %eax		C up
+	mov	12(%esp), %ecx		C un
+
+	cmp	$2, %ecx
+	jc	L(un1)
+	jz	L(un2)
+	cmp	$4, %ecx
+	jc	L(un3)
+	jz	L(un4)
+	jmp	L(big)
+
+L(un1):	mov	(%eax), %eax
+	mov	%edx, %ecx
+	mul	%eax
+	mov	%eax, (%ecx)
+	mov	%edx, 4(%ecx)
+	ret
+L(un2):	movd	(%eax), %mm0		C				un=2
+	movd	(%eax), %mm2		C				un=2
+	movd	4(%eax), %mm1		C				un=2
+	pmuludq	%mm0, %mm0		C 64b weight 0			un=2
+	pmuludq	%mm1, %mm2		C 64b weight 32			un=2
+	pmuludq	%mm1, %mm1		C 64b weight 64			un=2
+	movd	%mm0, (%edx)		C				un=2
+	psrlq	$32, %mm0		C 32b weight 32			un=2
+	pcmpeqd	%mm7, %mm7		C				un=2
+	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2
+	pand	%mm2, %mm7		C 31b weight 32			un=2
+	psrlq	$31, %mm2		C 33b weight 65			un=2
+	psllq	$1, %mm7		C 31b weight 33			un=2
+	paddq	%mm7, %mm0		C				un=2
+	movd	%mm0, 4(%edx)		C				un=2
+	psrlq	$32, %mm0		C				un=2
+	paddq	%mm2, %mm1		C				un=2
+	paddq	%mm0, %mm1		C				un=2
+	movd	%mm1, 8(%edx)		C				un=2
+	psrlq	$32, %mm1		C				un=2
+	movd	%mm1, 12(%edx)		C				un=2
+	emms
+	ret
+L(un3):	movd	(%eax), %mm7		C				un=3
+	movd	4(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm2, %mm6		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	lea	4(%edx), %edx		C				un=3
+	lea	4(%eax), %eax		C				un=3
+	jmp	L(am1)
+L(un4):	movd	(%eax), %mm7		C				un=4
+	movd	4(%eax), %mm6		C				un=4
+	pmuludq	%mm7, %mm6		C				un=4
+	movd	8(%eax), %mm0		C				un=4
+	pmuludq	%mm7, %mm0		C				un=4
+	movd	12(%eax), %mm1		C				un=4
+	pmuludq	%mm7, %mm1		C				un=4
+	movd	%mm6, 4(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	paddq	%mm0, %mm6		C				un=4
+	movd	%mm6, 8(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	paddq	%mm1, %mm6		C				un=4
+	movd	%mm6, 12(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	movd	%mm6, 16(%edx)		C				un=4
+	lea	4(%edx), %edx		C				un=4
+	lea	4(%eax), %eax		C				un=4
+	jmp	L(am2)
+
+L(big):	push	%esi
+	push	%ebx
+	push	%edi
+	pxor	%mm6, %mm6
+	movd	(%eax), %mm7		C
+	lea	4(%eax), %esi		C init up, up++
+	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
+	lea	4(%edx), %edi		C init rp, rp++
+	lea	4(%edx), %edx		C rp2++
+	lea	-4(%ecx), %ebx		C loop count
+	and	$3, %ecx
+	jz	L(3m)
+	cmp	$2, %ecx
+	ja	L(2m)
+	jb	L(0m)
+
+L(1m):
+	movd	(%eax), %mm4		C				m 1
+	lea	(%ebx), %ecx		C inner loop count		m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	jmp	L(m01)			C				m 1
+	ALIGN(16)			C				m 1
+L(lpm1):
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	movd	%mm6, -4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+L(m01):	pmuludq	%mm7, %mm0		C				m 1
+	paddq	%mm4, %mm6		C				m 1
+	movd	12(%eax), %mm1		C				m 1
+	movd	%mm6, (%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm1		C				m 1
+	paddq	%mm3, %mm6		C				m 1
+	movd	16(%eax), %mm4		C				m 1
+	movd	%mm6, 4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	lea	16(%eax), %eax		C				m 1
+	lea	16(%edx), %edx		C				m 1
+	sub	$4, %ecx		C				m 1
+	ja	L(lpm1)			C				m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	jmp	L(0)
+
+L(2m):
+	movd	(%eax), %mm1		C				m 2
+	lea	(%ebx), %ecx		C inner loop count		m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	movd	4(%eax), %mm4		C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	jmp	L(m10)			C				m 2
+	ALIGN(16)			C				m 2
+L(lpm2):
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+L(m10):	pmuludq	%mm7, %mm3		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	movd	12(%eax), %mm0		C				m 2
+	movd	%mm6, (%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm0		C				m 2
+	paddq	%mm4, %mm6		C				m 2
+	movd	16(%eax), %mm1		C				m 2
+	movd	%mm6, 4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	paddq	%mm3, %mm6		C				m 2
+	movd	20(%eax), %mm4		C				m 2
+	movd	%mm6, 8(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	lea	16(%eax), %eax		C				m 2
+	lea	16(%edx), %edx		C				m 2
+	sub	$4, %ecx		C				m 2
+	ja	L(lpm2)			C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	jmp	L(1)
+
+L(3m):
+	movd	(%eax), %mm0		C				m 3
+	lea	(%ebx), %ecx		C inner loop count		m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	movd	4(%eax), %mm1		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	movd	8(%eax), %mm4		C				m 3
+	jmp	L(lpm3)			C				m 3
+	ALIGN(16)			C				m 3
+L(lpm3):
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	12(%eax), %mm3		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm3		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	movd	16(%eax), %mm0		C				m 3
+	movd	%mm6, 4(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	paddq	%mm4, %mm6		C				m 3
+	movd	20(%eax), %mm1		C				m 3
+	movd	%mm6, 8(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	paddq	%mm3, %mm6		C				m 3
+	movd	24(%eax), %mm4		C				m 3
+	movd	%mm6, 12(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	lea	16(%eax), %eax		C				m 3
+	lea	16(%edx), %edx		C				m 3
+	sub	$4, %ecx		C				m 3
+	ja	L(lpm3)			C				m 3
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	jmp	L(2)
+
+L(0m):
+	movd	(%eax), %mm3		C				m 0
+	lea	(%ebx), %ecx		C inner loop count		m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	jmp	L(m00)			C				m 0
+	ALIGN(16)			C				m 0
+L(lpm0):
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	(%eax), %mm3		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	movd	%mm6, -8(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	paddq	%mm4, %mm6		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	movd	%mm6, -4(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+L(m00):	pmuludq	%mm7, %mm1		C				m 0
+	paddq	%mm3, %mm6		C				m 0
+	movd	12(%eax), %mm4		C				m 0
+	movd	%mm6, (%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	lea	16(%eax), %eax		C				m 0
+	lea	16(%edx), %edx		C				m 0
+	sub	$4, %ecx		C				m 0
+	ja	L(lpm0)			C				m 0
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	jmp	L(3)
+
+L(outer):
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 3
+	mov	%edi, %edx		C rp2 = rp			am 3
+	lea	4(%esi), %esi		C up++				am 3
+	lea	(%esi), %eax		C up2 = up			am 3
+	movd	(%eax), %mm0		C				am 3
+	lea	(%ebx), %ecx		C inner loop count		am 3
+	pxor	%mm6, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	movd	4(%eax), %mm1		C				am 3
+	movd	(%edx), %mm4		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	movd	8(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	4(%edx), %mm5		C				am 3
+	jmp	L(lam3)			C				am 3
+	ALIGN(16)			C				am 3
+L(lam3):
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	12(%eax), %mm3		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm3		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	16(%eax), %mm0		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+	movd	12(%edx), %mm5		C				am 3
+	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	20(%eax), %mm1		C				am 3
+	paddq	%mm3, %mm5		C				am 3
+	movd	16(%edx), %mm4		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	24(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	20(%edx), %mm5		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	lea	16(%eax), %eax		C				am 3
+	lea	16(%edx), %edx		C				am 3
+	sub	$4, %ecx		C				am 3
+	ja	L(lam3)			C				am 3
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+L(2):	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 2
+	mov	%edi, %edx		C rp2 = rp			am 2
+	lea	4(%esi), %esi		C up++				am 2
+	lea	(%esi), %eax		C up2 = up			am 2
+	movd	(%eax), %mm1		C				am 2
+	lea	(%ebx), %ecx		C inner loop count		am 2
+	pxor	%mm6, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	movd	4(%eax), %mm2		C				am 2
+	movd	(%edx), %mm5		C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	jmp	L(am10)			C				am 2
+	ALIGN(16)			C				am 2
+L(lam2):
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+L(am10):
+	pmuludq	%mm7, %mm3		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	12(%eax), %mm0		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+	movd	8(%edx), %mm5		C				am 2
+	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm0		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	16(%eax), %mm1		C				am 2
+	paddq	%mm3, %mm5		C				am 2
+	movd	12(%edx), %mm4		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	20(%eax), %mm2		C				am 2
+	paddq	%mm0, %mm4		C				am 2
+	movd	16(%edx), %mm5		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	lea	16(%eax), %eax		C				am 2
+	lea	16(%edx), %edx		C				am 2
+	sub	$4, %ecx		C				am 2
+	ja	L(lam2)			C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+L(1):	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 1
+	mov	%edi, %edx		C rp2 = rp			am 1
+	lea	4(%esi), %esi		C up++				am 1
+	lea	(%esi), %eax		C up2 = up			am 1
+	movd	(%eax), %mm2		C				am 1
+	lea	(%ebx), %ecx		C inner loop count		am 1
+	pxor	%mm6, %mm6		C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	jmp	L(am01)			C				am 1
+	ALIGN(16)			C				am 1
+L(lam1):
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+L(am01):
+	pmuludq	%mm7, %mm0		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	12(%eax), %mm1		C				am 1
+	paddq	%mm3, %mm5		C				am 1
+	movd	8(%edx), %mm4		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm1		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	16(%eax), %mm2		C				am 1
+	paddq	%mm0, %mm4		C				am 1
+	movd	12(%edx), %mm5		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	lea	16(%eax), %eax		C				am 1
+	lea	16(%edx), %edx		C				am 1
+	sub	$4, %ecx		C				am 1
+	ja	L(lam1)			C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+L(0):	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 0
+	mov	%edi, %edx		C rp2 = rp			am 0
+	lea	4(%esi), %esi		C up++				am 0
+	lea	(%esi), %eax		C up2 = up			am 0
+	movd	(%eax), %mm3		C				am 0
+	lea	(%ebx), %ecx		C inner loop count		am 0
+	pxor	%mm6, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	jmp	L(am00)			C				am 0
+	ALIGN(16)			C				am 0
+L(lam0):
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	(%eax), %mm3		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+L(am00):
+	pmuludq	%mm7, %mm1		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	12(%eax), %mm2		C				am 0
+	paddq	%mm0, %mm4		C				am 0
+	movd	8(%edx), %mm5		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	lea	16(%eax), %eax		C				am 0
+	lea	16(%edx), %edx		C				am 0
+	sub	$4, %ecx		C				am 0
+	ja	L(lam0)			C				am 0
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+L(3):	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	sub	$4, %ebx		C				am 0
+	ja	L(outer)			C				am 0
+
+	mov	%edi, %edx
+	mov	%esi, %eax
+	pop	%edi
+	pop	%ebx
+	pop	%esi
+
+L(am3):	C up[un-1..un-3] x up[un-4]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm1
+	movd	8(%eax), %mm2
+	movd	12(%eax), %mm3
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm1
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm2
+	movd	8(%edx), %mm6
+	pmuludq	%mm7, %mm3
+	paddq	%mm1, %mm4
+	paddq	%mm2, %mm5
+	paddq	%mm3, %mm6
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	paddq	%mm5, %mm4
+	movd	%mm4, 4(%edx)
+	psrlq	$32, %mm4
+	paddq	%mm6, %mm4
+	movd	%mm4, 8(%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 12(%edx)		C FIXME feed through!
+	lea	4(%eax), %eax
+
+L(am2):	C up[un-1..un-2] x up[un-3]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm1
+	movd	8(%eax), %mm2
+	movd	(%edx), %mm4
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm1
+	pmuludq	%mm7, %mm2
+	paddq	%mm1, %mm4
+	paddq	%mm2, %mm5
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	paddq	%mm5, %mm4
+	movd	%mm4, 4(%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 8(%edx)		C FIXME feed through!
+	lea	4(%eax), %eax
+
+L(am1):	C up[un-1] x up[un-2]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm2
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm2
+	paddq	%mm2, %mm4
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 4(%edx)
+
+C *** diag stuff, use elementary code for now
+
+	mov	4(%esp), %edx		C rp
+	mov	8(%esp), %eax		C up
+	mov	12(%esp), %ecx		C un
+
+	movd	(%eax), %mm2
+	pmuludq	%mm2, %mm2		C src[0]^2
+
+	pcmpeqd	%mm7, %mm7
+	psrlq	$32, %mm7
+
+	movd	4(%edx), %mm3		C dst[1]
+
+	movd	%mm2, (%edx)
+	psrlq	$32, %mm2
+
+	psllq	$1, %mm3		C 2*dst[1]
+	paddq	%mm3, %mm2
+	movd	%mm2, 4(%edx)
+	psrlq	$32, %mm2
+
+	sub	$2, %ecx
+
+L(diag):
+	movd	4(%eax), %mm0		C src limb
+	add	$4, %eax
+	pmuludq	%mm0, %mm0
+	movq	%mm7, %mm1
+	pand	%mm0, %mm1		C diagonal low
+	psrlq	$32, %mm0		C diagonal high
+
+	movd	8(%edx), %mm3
+	psllq	$1, %mm3		C 2*dst[i]
+	paddq	%mm3, %mm1
+	paddq	%mm1, %mm2
+	movd	%mm2, 8(%edx)
+	psrlq	$32, %mm2
+
+	movd	12(%edx), %mm3
+	psllq	$1, %mm3		C 2*dst[i+1]
+	paddq	%mm3, %mm0
+	paddq	%mm0, %mm2
+	movd	%mm2, 12(%edx)
+	add	$8, %edx
+	psrlq	$32, %mm2
+
+	sub	$1, %ecx
+	jnz	L(diag)
+
+	movd	4(%eax), %mm0		C src[size-1]
+	pmuludq	%mm0, %mm0
+	pand	%mm0, %mm7		C diagonal low
+	psrlq	$32, %mm0		C diagonal high
+
+	movd	8(%edx), %mm3		C dst[2*size-2]
+	psllq	$1, %mm3
+	paddq	%mm3, %mm7
+	paddq	%mm7, %mm2
+	movd	%mm2, 8(%edx)
+	psrlq	$32, %mm2
+
+	paddq	%mm0, %mm2
+	movd	%mm2, 12(%edx)		C dst[2*size-1]
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm
new file mode 100644
index 0000000..5ba1c01
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm
@@ -0,0 +1,119 @@
+dnl  Intel Pentium-4 mpn_sub_n -- mpn subtraction.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4	     6		6
+C P4 model 3-4 (Prescott)	4.25	     7.5	7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_sub_nc)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm0
+	jmp	L(start_nc)
+EPILOGUE()
+
+	ALIGN(8)
+PROLOGUE(mpn_sub_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+L(start_nc):
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry bit
+
+	movd	(%eax,%ecx,4), %mm1
+	movd	(%ebx,%ecx,4), %mm2
+	psubq	%mm2, %mm1
+
+	psubq	%mm0, %mm1
+	movd	%mm1, (%edx,%ecx,4)
+
+	psrlq	$63, %mm1
+
+	add	$1, %ecx
+	jz	L(done_mm1)
+
+	movd	(%eax,%ecx,4), %mm0
+	movd	(%ebx,%ecx,4), %mm2
+	psubq	%mm2, %mm0
+
+	psubq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$63, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+L(done_mm1):
+	movd	%mm1, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm
new file mode 100644
index 0000000..020675b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm
@@ -0,0 +1,182 @@
+dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		6.8
+C P6 model 13  (Dothan)		6.9
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	5.87
+C P4 model 3-4 (Prescott)	6.5
+
+C This code represents a step forward compared to the code available before
+C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
+C not good for P6.  For P4 it saved a bit over 1 c/l on both Northwood and
+C Prescott compared to the old code.
+C
+C The arrangements made here to get a two-instruction dependent chain are
+C slightly subtle.  In the loop the carry (or rather, borrow) is held as a
+C negative value so that a paddq can be used to give a low limb ready to
+C store, and a high limb ready to become the new carry after a psrlq.
+C
+C If the carry were a simple twos-complement negative then the psrlq shift
+C would need to bring in 0 bits or 1 bits according to whether the high was
+C zero or
+C non-zero, since a non-zero value would represent a negative needing sign
+C extension.  That wouldn't be particularly easy to arrange and certainly would
+C add an instruction to the dependent chain, so instead an offset is applied so
+C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
+C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
+C always positive and can always have 0 bits shifted in, which is what psrlq
+C does.
+C
+C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
+C done off the dependent chain.  The total adjustment then is to add
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
+C to remove the offset from the current carry, for a net add of
+C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
+C fetched.
+C
+C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
+C negative, which is how it's undone for the return value, but that doesn't
+C seem as clear.
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_submul_1c)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm1
+	jmp	L(start_1c)
+EPILOGUE()
+
+PROLOGUE(mpn_submul_1)
+deflit(`FRAME',0)
+	pxor	%mm1, %mm1		C initial borrow
+
+L(start_1c):
+	mov	PARAM_SRC, %eax
+	pcmpeqd	%mm0, %mm0
+
+	movd	PARAM_MULTIPLIER, %mm7
+	pcmpeqd	%mm6, %mm6
+
+	mov	PARAM_DST, %edx
+	psrlq	$32, %mm0		C 0x00000000FFFFFFFF
+
+	mov	PARAM_SIZE, %ecx
+	psllq	$32, %mm6		C 0xFFFFFFFF00000000
+
+	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001
+
+	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow
+
+
+	movd	(%eax), %mm3		C up
+	movd	(%edx), %mm4		C rp
+
+	add	$-1, %ecx
+	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	jnz	L(gt1)
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, (%edx)		C result
+	jmp	L(rt)
+
+L(gt1):	movd	4(%eax), %mm1		C up
+	movd	4(%edx), %mm2		C rp
+
+	add	$-1, %ecx
+	jz	L(eev)
+
+	ALIGN(16)
+L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm1
+	psubq	%mm3, %mm4		C prod
+	movd	8(%eax), %mm3		C up
+	paddq	%mm4, %mm0		C borrow
+	movd	8(%edx), %mm4		C rp
+	movd	%mm0, (%edx)		C result
+	psrlq	$32, %mm0
+
+	add	$-1, %ecx
+	jz	L(eod)
+
+	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	psubq	%mm1, %mm2		C prod
+	movd	12(%eax), %mm1		C up
+	paddq	%mm2, %mm0		C borrow
+	movd	12(%edx), %mm2		C rp
+	movd	%mm0, 4(%edx)		C result
+	psrlq	$32, %mm0
+
+	lea	8(%eax), %eax
+	lea	8(%edx), %edx
+	add	$-1, %ecx
+	jnz	L(top)
+
+
+L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm1
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, (%edx)		C result
+	psrlq	$32, %mm0
+	psubq	%mm1, %mm2		C prod
+	paddq	%mm2, %mm0		C borrow
+	movd	%mm0, 4(%edx)		C result
+L(rt):	psrlq	$32, %mm0
+	movd	%mm0, %eax
+	not	%eax
+	emms
+	ret
+
+L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	psubq	%mm1, %mm2		C prod
+	paddq	%mm2, %mm0		C borrow
+	movd	%mm0, 4(%edx)		C result
+	psrlq	$32, %mm0
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, 8(%edx)		C result
+	jmp	L(rt)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/rshift.asm b/third_party/gmp/mpn/x86/rshift.asm
new file mode 100644
index 0000000..a60dcaa
--- /dev/null
+++ b/third_party/gmp/mpn/x86/rshift.asm
@@ -0,0 +1,108 @@
+dnl  x86 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54	 7.5
+C P55	 7.0
+C P6	 2.5
+C K6	 4.5
+C K7	 5.0
+C P4	16.5
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+deflit(`FRAME',12)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%edx
+	movl	PARAM_SHIFT,%ecx
+
+	leal	-4(%edi,%edx,4),%edi
+	leal	(%esi,%edx,4),%esi
+	negl	%edx
+
+	movl	(%esi,%edx,4),%ebx	C read least significant limb
+	xorl	%eax,%eax
+	shrdl(	%cl, %ebx, %eax)	C compute carry limb
+	incl	%edx
+	jz	L(end)
+	pushl	%eax			C push carry limb onto stack
+	testb	$1,%dl
+	jnz	L(1)			C enter loop in the middle
+	movl	%ebx,%eax
+
+	ALIGN(8)
+L(oop):	movl	(%esi,%edx,4),%ebx	C load next higher limb
+	shrdl(	%cl, %ebx, %eax)	C compute result limb
+	movl	%eax,(%edi,%edx,4)	C store it
+	incl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shrdl(	%cl, %eax, %ebx)
+	movl	%ebx,(%edi,%edx,4)
+	incl	%edx
+	jnz	L(oop)
+
+	shrl	%cl,%eax		C compute most significant limb
+	movl	%eax,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+L(end):	shrl	%cl,%ebx		C compute most significant limb
+	movl	%ebx,(%edi)		C store it
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/sec_tabselect.asm b/third_party/gmp/mpn/x86/sec_tabselect.asm
new file mode 100644
index 0000000..c7c2e05
--- /dev/null
+++ b/third_party/gmp/mpn/x86/sec_tabselect.asm
@@ -0,0 +1,115 @@
+dnl  x86 mpn_sec_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 4.5
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C Intel Atom			 ?
+C AMD K6			 ?
+C AMD K7			 3.4
+C AMD K8			 ?
+C AMD K10			 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2 could result in a many-fold speedup.
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%edi')
+define(`tp',     `%esi')
+define(`n',      `%ebx')
+define(`nents',  `%ecx')
+define(`which',  `36(%esp)')
+
+define(`i',      `%ebp')
+define(`maskp',  `20(%esp)')
+define(`maskn',  `32(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+	mov	20(%esp), rp
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents
+
+	lea	(rp,n,4), rp
+	lea	(tp,n,4), tp
+	sub	nents, which
+L(outer):
+	mov	which, %eax
+	add	nents, %eax
+	neg	%eax			C set CF iff 'which' != k
+	sbb	%eax, %eax
+	mov	%eax, maskn
+	not	%eax
+	mov	%eax, maskp
+
+	mov	n, i
+	neg	i
+
+	ALIGN(16)
+L(top):	mov	(tp,i,4), %eax
+	and	maskp, %eax
+	mov	(rp,i,4), %edx
+	and	maskn, %edx
+	or	%edx, %eax
+	mov	%eax, (rp,i,4)
+	inc	i
+	js	L(top)
+
+L(end):	mov	n, %eax
+	lea	(tp,%eax,4), tp
+	dec	nents
+	jne	L(outer)
+
+L(outer_end):
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h b/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h
new file mode 100644
index 0000000..e9f1d8f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Silvermont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-30, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     16
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 64.62% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define DIV_1_VS_MUL_1_PERCENT             204
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD               105
+#define MUL_TOOM44_THRESHOLD               236
+#define MUL_TOOM6H_THRESHOLD               351
+#define MUL_TOOM8H_THRESHOLD               502
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     163
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     174
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     215
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 36
+#define SQR_TOOM3_THRESHOLD                138
+#define SQR_TOOM4_THRESHOLD                360
+#define SQR_TOOM6_THRESHOLD                494
+#define SQR_TOOM8_THRESHOLD                620
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    460, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    735,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,11}, {   3455,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 177
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             400  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    400, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    799,10}, {    415, 9}, {    831,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    735, 9}, {   1471,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 175
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  56
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD               7373
+
+#define DC_DIV_QR_THRESHOLD                 76
+#define DC_DIVAPPR_Q_THRESHOLD             336
+#define DC_BDIV_QR_THRESHOLD                66
+#define DC_BDIV_Q_THRESHOLD                218
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               345
+#define INV_APPR_THRESHOLD                 342
+
+#define BINV_NEWTON_THRESHOLD              366
+#define REDC_1_TO_REDC_N_THRESHOLD          91
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1858
+#define MUPI_DIV_QR_THRESHOLD              171
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  3,17,102,404,1185
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               272
+#define SET_STR_PRECOMPUTE_THRESHOLD       788
+
+#define FAC_DSC_THRESHOLD                  132
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 0.59% faster than 3 */
+#define HGCD_THRESHOLD                     142
+#define HGCD_APPR_THRESHOLD                181
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   492
+#define GCDEXT_DC_THRESHOLD                365
+#define JACOBI_BASE_METHOD                   1  /* 0.41% faster than 2 */
+
+/* Tuneup completed successfully, took 147027 seconds */
diff --git a/third_party/gmp/mpn/x86/skylake/gmp-mparam.h b/third_party/gmp/mpn/x86/skylake/gmp-mparam.h
new file mode 100644
index 0000000..fb87957
--- /dev/null
+++ b/third_party/gmp/mpn/x86/skylake/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* x86/skylake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                15
+#define MOD_1_UNNORM_THRESHOLD              16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 5.63% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             12
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              17
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           18
+
+#define DIV_1_VS_MUL_1_PERCENT             348
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               208
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     149
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     196
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 40
+#define SQR_TOOM3_THRESHOLD                129
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                608
+
+#define MULMID_TOOM42_THRESHOLD             72
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               21
+
+#define MUL_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    530, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     51, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2431,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2815,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 154
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    460, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351,11}, \
+    {    191,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    927,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    895,11}, \
+    {   1791,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 155
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  68
+#define MULLO_MUL_N_THRESHOLD            13555
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 117
+#define SQRLO_SQR_THRESHOLD              10988
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             163
+#define DC_BDIV_QR_THRESHOLD                66
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               165
+#define INV_APPR_THRESHOLD                 157
+
+#define BINV_NEWTON_THRESHOLD              300
+#define REDC_1_TO_REDC_N_THRESHOLD          68
+
+#define MU_DIV_QR_THRESHOLD               1718
+#define MU_DIVAPPR_Q_THRESHOLD            1685
+#define MUPI_DIV_QR_THRESHOLD               62
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  1,17,129,547,1317
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               354
+#define SET_STR_PRECOMPUTE_THRESHOLD       860
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    5  /* 1.04% faster than 3 */
+#define HGCD_THRESHOLD                     114
+#define HGCD_APPR_THRESHOLD                132
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   474
+#define GCDEXT_DC_THRESHOLD                379
+#define JACOBI_BASE_METHOD                   1  /* 27.39% faster than 4 */
+
+/* Tuneup completed successfully, took 31721 seconds */
diff --git a/third_party/gmp/mpn/x86/sqr_basecase.asm b/third_party/gmp/mpn/x86/sqr_basecase.asm
new file mode 100644
index 0000000..39f8a89
--- /dev/null
+++ b/third_party/gmp/mpn/x86/sqr_basecase.asm
@@ -0,0 +1,359 @@
+dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct  cycles/triangleproduct
+C P5
+C P6
+C K6
+C K7
+C P4
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+C
+C The mul1 loop is not unrolled like mul_1.asm; it doesn't seem worth the
+C code size to do so here.
+C
+C Enhancements:
+C
+C The addmul loop here is also not unrolled like aorsmul_1.asm and
+C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
+C complexity, but if it's worth doing in the other places then it should be
+C worthwhile here.
+C
+C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
+C might be worth considering.  That'd add quite a bit to the code size, but
+C only as much as is used would be dragged into L1 cache.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	movl	(%eax), %eax
+	mull	%eax
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	pushl	%esi
+	pushl	%edi
+
+	movl	%edx, %esi	C dst[1]
+	movl	%eax, (%ecx)	C dst[0]
+
+	movl	4(%ebx), %eax
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+deflit(`FRAME',0)
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
+	leal	(%eax,%edx,4), %esi	C &src[size]
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	movl	(%eax), %ebp		C src[0], multiplier
+	movl	%edx, %ecx
+
+	negl	%ecx			C -size
+	xorl	%ebx, %ebx		C clear carry limb
+
+	incl	%ecx			C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%eax, %ebx
+	adcl	$0, %edx
+	movl	%ebx, (%edi,%ecx,4)
+	movl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+	movl	%ebx, (%edi)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last product src[size-2]*src[size-1], which is the end corner
+	C of the product triangle, is handled separately at the end to save
+	C looping overhead.  If size is 3 then it's only this that needs to
+	C be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	subl	$3, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+
+dnl  re-use parameter space
+define(VAR_OUTER,`PARAM_DST')
+
+L(outer):
+	C eax
+	C ebx
+	C ecx
+	C edx	outer loop counter, -(size-3) to -1
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+	C ebp
+
+	movl	%ecx, VAR_OUTER
+	addl	$4, %edi		C advance dst end
+
+	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
+	subl	$1, %ecx
+
+	xorl	%ebx, %ebx		C initial carry limb
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, -n-1 to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%ebx, %eax
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	addl	$1, %ecx
+	jl	L(inner)
+
+
+	movl	%ebx, (%edi)
+	movl	VAR_OUTER, %ecx
+	incl	%ecx
+	jnz	L(outer)
+
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+
+	movl	-4(%esi), %eax
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+	addl	%eax, 0(%edi)
+	adcl	$0, %edx
+	movl	%edx, 4(%edi)		C dst high limb
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+	movl	PARAM_SIZE, %eax
+	negl	%eax
+	addl	$1, %eax		C -(size-1) and clear carry
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	rcll	8(%edi,%eax,8)
+	rcll	12(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	%eax, 8(%edi)		C dst most significant limb
+
+
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	PARAM_SRC, %esi
+	movl	(%esi), %eax		C src[0]
+	mull	%eax			C src[0]^2
+
+	movl	PARAM_SIZE, %ecx
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx			C -size
+	movl	%edx, %ebx		C initial carry
+
+	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
+	incl	%ecx			C -(size-1)
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, -(size-1) to -1
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%eax
+
+	addl	%ebx, 8(%edi,%ecx,8)
+	movl	%edx, %ebx
+
+	adcl	%eax, 12(%edi,%ecx,8)
+	adcl	$0, %ebx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	addl	%ebx, 8(%edi)		C dst most significant limb
+
+	popl	%ebp
+	popl	%esi
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/t-zdisp.sh b/third_party/gmp/mpn/x86/t-zdisp.sh
new file mode 100755
index 0000000..61efdd6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/t-zdisp.sh
@@ -0,0 +1,71 @@
+#! /bin/sh
+#
+# Copyright 2000 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+#        $(srcdir)/x86/t-zdisp.sh
+#
+# Run the Zdisp() macro instructions through the assembler to check
+# the encodings used.  Mismatches are printed; no output means all ok.
+#
+# This program is only meant for use during development.  It can be
+# run in the mpn build directory of any x86 configuration.
+#
+# For this test the assembler needs to generate byte sized 0
+# displacements when given something like 0(%eax).  Recent versions of
+# gas are suitable (eg. 2.9.x or 2.10.x).
+
+set -e
+
+cat >tmp-zdisptest.asm <<\EOF
+
+include(`../config.m4')
+
+dnl  Redefine Zdisp_match to output its pattern and encoding.
+define(`Zdisp_match',
+`define(`Zdisp_found',1)dnl
+ifelse(`$2',0,`	$1	$2$3, $4')`'dnl
+ifelse(`$3',0,`	$1	$2, $3$4')`'dnl
+
+	.byte	$5
+')
+	.text
+	Zdisp()
+EOF
+
+m4 tmp-zdisptest.asm >tmp-zdisptest.s
+as -o tmp-zdisptest.o tmp-zdisptest.s
+
+# Demand duplicates from the instruction patterns and byte encodings.
+objdump -d tmp-zdisptest.o | awk '
+/^ *[a-z0-9]+:/ {
+	sub(/^ *[a-z0-9]+:/,"")
+        print
+}' | sort | uniq -u
diff --git a/third_party/gmp/mpn/x86/t-zdisp2.pl b/third_party/gmp/mpn/x86/t-zdisp2.pl
new file mode 100755
index 0000000..b441b65
--- /dev/null
+++ b/third_party/gmp/mpn/x86/t-zdisp2.pl
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+#
+# Copyright 2001, 2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+#        $(srcdir)/x86/t-zdisp2.pl
+#
+# Grep for any "0(reg...)" addressing modes coming out of the x86 .asm
+# files.  Additive expressions like "12+4-16" are recognized too.
+#
+# Old gas doesn't preserve the "0" displacement, so if it's wanted then
+# Zdisp ought to be used to give explicit .byte sequences.  See
+# mpn/x86/README.
+#
+# No output means everything is ok.  All the asm files are put through m4 in
+# PIC and non-PIC modes, and in each multi-function form, all of which can
+# take a while to run.
+#
+# This program is only meant for use during development.
+
+use strict;
+use File::Find;
+use File::Basename;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+
+my $srcdir;
+open IN, '<Makefile' or die;
+while (<IN>) {
+  if (/^srcdir[ \t]*=[ \t]*(.*)/) {
+    $srcdir = $1;
+    last;
+  }
+}
+close IN or die;
+defined $srcdir or die "Cannot find \$srcdir in Makefile\n";
+
+my $filecount = 0;
+
+my $tempfile = 't-zdisp2.tmp';
+open KARA, ">$tempfile" or die;
+close KARA or die;
+
+find({ wanted => \&process, preprocess => \&process_mparam, no_chdir => 1 },
+     "$srcdir/x86");
+
+sub process {
+  if (/gmp-mparam.h$/) {
+    process_mparam($_);
+  } elsif (/\.asm$/) {
+    process_asm($_);
+  }
+}
+
+# Ensure we're using the right SQR_TOOM2_THRESHOLD for the part of the
+# tree being processed.
+sub process_mparam {
+  my $file = "$File::Find::dir/gmp-mparam.h";
+  if (-f $file) {
+    print "$file\n" if $opt{'t'};
+    open MPARAM, "<$file" or die;
+    while (<MPARAM>) {
+      if (/^#define SQR_TOOM2_THRESHOLD[ \t]*([0-9][0-9]*)/) {
+        open KARA, ">$tempfile" or die;
+        print KARA "define(\`SQR_TOOM2_THRESHOLD',$1)\n\n";
+        print "define(\`SQR_TOOM2_THRESHOLD',$1)\n" if $opt{'t'};
+        close KARA or die;
+        last;
+      }
+    }
+    close MPARAM or die;
+  }
+  return @_;
+}
+
+sub process_asm {
+  my ($file) = @_;
+  my $base = basename ($file, '.asm');
+
+  my @funs;
+  if    ($base eq 'aors_n')    { @funs = qw(add_n sub_n); }
+  elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
+  elsif ($base eq 'popham')    { @funs = qw(popcount hamdist); }
+  elsif ($base eq 'logops_n')  { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
+  elsif ($base eq 'lorrshift') { @funs = qw(lshift rshift); }
+  else                         { @funs = ($base); }
+
+  foreach my $fun (@funs) {
+    foreach my $pic ('', ' -DPIC') {
+      my $header = "$file: 0: $pic\n";
+      $filecount++;
+
+      my $m4 = "m4 -DHAVE_HOST_CPU_athlon -DOPERATION_$fun $pic ../config.m4 $tempfile $file";
+      print "$m4\n" if $opt{'t'};
+
+      open IN, "$m4 |" or die;
+      while (<IN>) {
+        next unless /([0-9+-][0-9 \t+-]*)\(%/;
+        my $pat=$1;
+        $pat = eval($pat);
+        next if ($pat != 0);
+        print "$header$_";
+        $header='';
+      }
+      close IN or die;
+    }
+  }
+}
+
+unlink($tempfile);
+print "total $filecount processed\n";
+exit 0;
+
+
+# Local variables:
+# perl-indent-level: 2
+# End:
diff --git a/third_party/gmp/mpn/x86/udiv.asm b/third_party/gmp/mpn/x86/udiv.asm
new file mode 100644
index 0000000..a3ee088
--- /dev/null
+++ b/third_party/gmp/mpn/x86/udiv.asm
@@ -0,0 +1,52 @@
+dnl  x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C                           mp_limb_t divisor);
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW,     12)
+defframe(PARAM_HIGH,    8)
+defframe(PARAM_REMPTR,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+	movl	PARAM_LOW, %eax
+	movl	PARAM_HIGH, %edx
+	divl	PARAM_DIVISOR
+	movl	PARAM_REMPTR, %ecx
+	movl	%edx, (%ecx)
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/umul.asm b/third_party/gmp/mpn/x86/umul.asm
new file mode 100644
index 0000000..34fe434
--- /dev/null
+++ b/third_party/gmp/mpn/x86/umul.asm
@@ -0,0 +1,51 @@
+dnl  mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+defframe(PARAM_M2,    12)
+defframe(PARAM_M1,     8)
+defframe(PARAM_LOWPTR, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+	movl	PARAM_LOWPTR, %ecx
+	movl	PARAM_M1, %eax
+	mull	PARAM_M2
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/x86-defs.m4 b/third_party/gmp/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000..81309b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/x86-defs.m4
@@ -0,0 +1,1024 @@
+divert(-1)
+
+dnl  m4 macros for x86 assembler.
+
+dnl  Copyright 1999-2003, 2007, 2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Notes:
+dnl
+dnl  m4 isn't perfect for processing BSD style x86 assembler code, the main
+dnl  problems are,
+dnl
+dnl  1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl     foo(%ebx) expands as a macro rather than a constant.  This is worked
+dnl     around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl  2. Immediates in macro definitions need a space or `' to stop the $
+dnl     looking like a macro parameter.  For example,
+dnl
+dnl	        define(foo, `mov $ 123, %eax')
+dnl
+dnl     This is only a problem in macro definitions, not in ordinary text,
+dnl     and not in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(GMP_LIMB_BYTES, 4)
+
+
+dnl  Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL.  We
+dnl  undefine PIC since we don't need to be position independent in this
+dnl  case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc.
+
+ifdef(`DLL_EXPORT',`undefine(`PIC')')
+
+
+dnl  Usage: CPUVEC_FUNCS_LIST
+dnl
+dnl  A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
+dnl  order they appear in that structure.
+
+define(CPUVEC_FUNCS_LIST,
+``add_n',
+`addlsh1_n',
+`addlsh2_n',
+`addmul_1',
+`addmul_2',
+`bdiv_dbm1c',
+`cnd_add_n',
+`cnd_sub_n',
+`com',
+`copyd',
+`copyi',
+`divexact_1',
+`divrem_1',
+`gcd_11',
+`lshift',
+`lshiftc',
+`mod_1',
+`mod_1_1p',
+`mod_1_1p_cps',
+`mod_1s_2p',
+`mod_1s_2p_cps',
+`mod_1s_4p',
+`mod_1s_4p_cps',
+`mod_34lsub1',
+`modexact_1c_odd',
+`mul_1',
+`mul_basecase',
+`mullo_basecase',
+`preinv_divrem_1',
+`preinv_mod_1',
+`redc_1',
+`redc_2',
+`rshift',
+`sqr_basecase',
+`sub_n',
+`sublsh1_n',
+`submul_1'')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  In the x86 code we use explicit TEXT and ALIGN() calls in the code,
+dnl  since different alignments are wanted in various circumstances.  So for
+dnl  instance,
+dnl
+dnl                  TEXT
+dnl                  ALIGN(16)
+dnl          PROLOGUE(mpn_add_n)
+dnl          ...
+dnl          EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_PROFILING')
+	`GLOBL	$1
+	TYPE($1,`function')
+	COFF_TYPE($1)
+$1:
+ifelse(WANT_PROFILING,`prof',      `	call_mcount')
+ifelse(WANT_PROFILING,`gprof',     `	call_mcount')
+ifelse(WANT_PROFILING,`instrument',`	call_instrument(enter)')
+')
+
+
+dnl  Usage: COFF_TYPE(GSYM_PREFIX`'foo)
+dnl
+dnl  Emit COFF style ".def ... .endef" type information for a function, when
+dnl  supported.  The argument should include any GSYM_PREFIX.
+dnl
+dnl  See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE.
+
+define(COFF_TYPE,
+m4_assert_numargs(1)
+m4_assert_defined(`HAVE_COFF_TYPE')
+`ifelse(HAVE_COFF_TYPE,yes,
+	`.def	$1
+	.scl	2
+	.type	32
+	.endef')')
+
+
+dnl  Usage: call_mcount
+dnl
+dnl  For `gprof' style profiling, %ebp is setup as a frame pointer.  None of
+dnl  the assembler routines use %ebp this way, so it's done only for the
+dnl  benefit of mcount.  glibc sysdeps/i386/i386-mcount.S shows how mcount
+dnl  gets the current function from (%esp) and the parent from 4(%ebp).
+dnl
+dnl  For `prof' style profiling gcc generates mcount calls without setting
+dnl  up %ebp, and the same is done here.
+
+define(`call_mcount',
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+m4_assert_defined(`MCOUNT_PIC_REG')
+m4_assert_defined(`MCOUNT_NONPIC_REG')
+m4_assert_defined(`MCOUNT_PIC_CALL')
+m4_assert_defined(`MCOUNT_NONPIC_CALL')
+`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
+`	DATA
+	ALIGN(4)
+L(mcount_data_`'mcount_counter):
+	W32	0
+	TEXT
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	pushl	%ebp
+	movl	%esp, %ebp
+')dnl
+ifdef(`PIC',
+`	pushl	%ebx
+	call_movl_eip_to_ebx
+L(mcount_here_`'mcount_counter):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(mcount_here_`'mcount_counter)], %ebx
+ifelse(MCOUNT_PIC_REG,,,
+`	leal	L(mcount_data_`'mcount_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
+MCOUNT_PIC_CALL
+	popl	%ebx
+',`dnl non-PIC
+ifelse(MCOUNT_NONPIC_REG,,,
+`	movl	`$'L(mcount_data_`'mcount_counter), MCOUNT_NONPIC_REG
+')dnl
+MCOUNT_NONPIC_CALL
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	popl	%ebp
+')
+define(`mcount_counter',incr(mcount_counter))
+')
+
+define(mcount_counter,1)
+
+
+dnl  Usage: call_instrument(enter|exit)
+dnl
+dnl  Call __cyg_profile_func_enter or __cyg_profile_func_exit.
+dnl
+dnl  For PIC, most routines don't require _GLOBAL_OFFSET_TABLE_ themselves
+dnl  so %ebx is just setup for these calls.  It's a bit wasteful to repeat
+dnl  the setup for the exit call having done it earlier for the enter, but
+dnl  there's nowhere very convenient to hold %ebx through the length of a
+dnl  routine, in general.
+dnl
+dnl  For PIC, because instrument_current_function will be within the current
+dnl  object file we can get it just as an offset from %eip, there's no need
+dnl  to use the GOT.
+dnl
+dnl  No attempt is made to maintain the stack alignment gcc generates with
+dnl  -mpreferred-stack-boundary.  This wouldn't be hard, but it seems highly
+dnl  unlikely the instrumenting functions would be doing anything that'd
+dnl  benefit from alignment, in particular they're unlikely to be using
+dnl  doubles or long doubles on the stack.
+dnl
+dnl  The FRAME scheme is used to conveniently account for the register saves
+dnl  before accessing the return address.  Any previous value is saved and
+dnl  restored, since plenty of code keeps a value across a "ret" in the
+dnl  middle of a routine.
+
+define(call_instrument,
+m4_assert_numargs(1)
+`	pushdef(`FRAME',0)
+ifelse($1,exit,
+`	pushl	%eax	FRAME_pushl()	C return value
+')
+ifdef(`PIC',
+`	pushl	%ebx	FRAME_pushl()
+	call_movl_eip_to_ebx
+L(instrument_here_`'instrument_count):
+	movl	%ebx, %ecx
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(instrument_here_`'instrument_count)], %ebx
+	C use addl rather than leal to avoid old gas bugs, see mpn/x86/README
+	addl	$instrument_current_function-L(instrument_here_`'instrument_count), %ecx
+	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
+	pushl	%ecx				FRAME_pushl()	C this function
+	call	GSYM_PREFIX`'__cyg_profile_func_$1@PLT
+	addl	$`'8, %esp
+	popl	%ebx
+',
+`	C non-PIC
+	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
+	pushl	$instrument_current_function	FRAME_pushl()	C this function
+	call	GSYM_PREFIX`'__cyg_profile_func_$1
+	addl	$`'8, %esp
+')
+ifelse($1,exit,
+`	popl	%eax			C return value
+')
+	popdef(`FRAME')
+define(`instrument_count',incr(instrument_count))
+')
+define(instrument_count,1)
+
+
+dnl  Usage: instrument_current_function
+dnl
+dnl  Return the current function name for instrumenting purposes.  This is
+dnl  PROLOGUE_current_function, but it sticks at the first such name seen.
+dnl
+dnl  Sticking to the first name seen ensures that multiple-entrypoint
+dnl  functions like mpn_add_nc and mpn_add_n will make enter and exit calls
+dnl  giving the same function address.
+
+define(instrument_current_function,
+m4_assert_numargs(-1)
+`ifdef(`instrument_current_function_seen',
+`instrument_current_function_seen',
+`define(`instrument_current_function_seen',PROLOGUE_current_function)dnl
+PROLOGUE_current_function')')
+
+
+dnl  Usage: call_movl_eip_to_ebx
+dnl
+dnl  Generate a call to L(movl_eip_to_ebx), and record the need for that
+dnl  routine.
+
+define(call_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`call	L(movl_eip_to_ebx)
+define(`movl_eip_to_ebx_needed',1)')
+
+dnl  Usage: generate_movl_eip_to_ebx
+dnl
+dnl  Emit a L(movl_eip_to_ebx) routine, if needed and not already generated.
+
+define(generate_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`ifelse(movl_eip_to_ebx_needed,1,
+`ifelse(movl_eip_to_ebx_done,1,,
+`L(movl_eip_to_ebx):
+	movl	(%esp), %ebx
+	ret_internal
+define(`movl_eip_to_ebx_done',1)
+')')')
+
+
+dnl  Usage: ret
+dnl
+dnl  Generate a "ret", but if doing instrumented profiling then call
+dnl  __cyg_profile_func_exit first.
+
+define(ret,
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+`ifelse(WANT_PROFILING,instrument,
+`ret_instrument',
+`ret_internal')
+generate_movl_eip_to_ebx
+')
+
+
+dnl  Usage: ret_internal
+dnl
+dnl  A plain "ret", without any __cyg_profile_func_exit call.  This can be
+dnl  used for a return which is internal to some function, such as when
+dnl  getting %eip for PIC.
+
+define(ret_internal,
+m4_assert_numargs(-1)
+``ret'')
+
+
+dnl  Usage: ret_instrument
+dnl
+dnl  Generate call to __cyg_profile_func_exit and then a ret.  If a ret has
+dnl  already been seen from this function then jump to that chunk of code,
+dnl  rather than emitting it again.
+
+define(ret_instrument,
+m4_assert_numargs(-1)
+`ifelse(m4_unquote(ret_instrument_seen_`'instrument_current_function),1,
+`jmp	L(instrument_exit_`'instrument_current_function)',
+`define(ret_instrument_seen_`'instrument_current_function,1)
+L(instrument_exit_`'instrument_current_function):
+call_instrument(exit)
+	ret_internal')')
+
+
+dnl  Usage: _GLOBAL_OFFSET_TABLE_
+dnl
+dnl  Expand to _GLOBAL_OFFSET_TABLE_ plus any necessary underscore prefix.
+dnl  This lets us write plain _GLOBAL_OFFSET_TABLE_ in SVR4 style, but still
+dnl  work with systems requiring an extra underscore such as OpenBSD.
+dnl
+dnl  deflit is used so "leal _GLOBAL_OFFSET_TABLE_(%eax), %ebx" will come
+dnl  out right, though that form doesn't work properly in gas (see
+dnl  mpn/x86/README).
+
+deflit(_GLOBAL_OFFSET_TABLE_,
+m4_assert_defined(`GOT_GSYM_PREFIX')
+`GOT_GSYM_PREFIX`_GLOBAL_OFFSET_TABLE_'')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Various x86 macros.
+dnl
+
+
+dnl  Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl  Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl  This is useful for testing, for example align to something very strict
+dnl  and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl  Generally you wouldn't execute across the padding, but it's done with
+dnl  nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,`	nop
+')')
+
+
+dnl  Usage: defframe(name,offset)
+dnl
+dnl  Make a definition like the following with which to access a parameter
+dnl  or variable on the stack.
+dnl
+dnl         define(name,`FRAME+offset(%esp)')
+dnl
+dnl  Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl  byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl  Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl  zero offset is wanted.
+dnl
+dnl  The new macro also gets a check that when it's used FRAME is actually
+dnl  defined, and that the final %esp offset isn't negative, which would
+dnl  mean an attempt to access something below the current %esp.
+dnl
+dnl  deflit() is used rather than a plain define(), so the new macro won't
+dnl  delete any following parenthesized expression.  name(%edi) will come
+dnl  out say as 16(%esp)(%edi).  This isn't valid assembler and should
+dnl  provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl  See README for more on the suggested way to access the stack frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl  Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+m4_assert_numargs(1)
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl  Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl  Usage: FRAME_pushl()
+dnl         FRAME_popl()
+dnl         FRAME_addl_esp(n)
+dnl         FRAME_subl_esp(n)
+dnl
+dnl  Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl  %esp of n bytes.
+dnl
+dnl  Using these macros is completely optional.  Sometimes it makes more
+dnl  sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl  jumps and different sequences of FRAME values need to be used in
+dnl  different places.
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl  Usage: defframe_pushl(name)
+dnl
+dnl  Do a combination FRAME_pushl() and a defframe() to name the stack
+dnl  location just pushed.  This should come after a pushl instruction.
+dnl  Putting it on the same line works and avoids lengthening the code.  For
+dnl  example,
+dnl
+dnl         pushl   %eax     defframe_pushl(VAR_COUNTER)
+dnl
+dnl  Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl  current value without tracking future changes.
+
+define(defframe_pushl,
+m4_assert_numargs(1)
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Assembler instruction macros.
+dnl
+
+
+dnl  Usage: emms_or_femms
+dnl         femms_available_p
+dnl
+dnl  femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl  femms instruction is available.  emms_or_femms expands to femms if
+dnl  available, or emms if not.
+dnl
+dnl  emms_or_femms is meant for use in the K6 directory where plain K6
+dnl  (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl  supported together.
+dnl
+dnl  On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl  emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+	`HAVE_HOST_CPU_k62',
+	`HAVE_HOST_CPU_k63',
+	`HAVE_HOST_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl  Usage: femms
+dnl
+dnl  Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the
+dnl  following is a replacement using .byte.
+
+define(femms,
+m4_assert_numargs(-1)
+`.byte	15,14	C AMD 3DNow femms')
+
+
+dnl  Usage: jadcl0(op)
+dnl
+dnl  Generate a jnc/incl as a substitute for adcl $0,op.  Note this isn't an
+dnl  exact replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl  This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl  mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl  misprediction penalty is small, and the multiply algorithm used leads
+dnl  to a carry bit on average only 1/4 of the time.
+dnl
+dnl  jadcl0_disabled can be set to 1 to instead generate an ordinary adcl
+dnl  for comparison.  For example,
+dnl
+dnl		define(`jadcl0_disabled',1)
+dnl
+dnl  When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl  the same size as an adcl.  This makes it possible to use the exact same
+dnl  computed jump code when testing the relative speed of the two.
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+	`adcl	$`'0, $1',
+	`jnc	L(jadcl0_`'jadcl0_counter)
+	incl	$1
+L(jadcl0_`'jadcl0_counter):
+define(`jadcl0_counter',incr(jadcl0_counter))')')
+
+define(jadcl0_counter,1)
+
+
+dnl  Usage: x86_lookup(target, key,value, key,value, ...)
+dnl         x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl  Look for `target' among the `key' parameters.
+dnl
+dnl  x86_lookup expands to the corresponding `value', or generates an error
+dnl  if `target' isn't found.
+dnl
+dnl  x86_lookup_p expands to 1 if `target' is found, or 0 if not.
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2',    `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl  Usage: x86_opcode_reg32(reg)
+dnl         x86_opcode_reg32_p(reg)
+dnl
+dnl  x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl  32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl  x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl  if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl  Usage: x86_opcode_tttn(cond)
+dnl
+dnl  Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl  condition (like `c', `ae', etc).
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_tttn_list)')
+
+define(x86_opcode_tttn_list,
+``o',  0,
+`no',  1,
+`b',   2, `c',  2, `nae',2,
+`nb',  3, `nc', 3, `ae', 3,
+`e',   4, `z',  4,
+`ne',  5, `nz', 5,
+`be',  6, `na', 6,
+`nbe', 7, `a',  7,
+`s',   8,
+`ns',  9,
+`p',  10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l',  12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g',  15')
+
+
+dnl  Usage: cmovCC(%srcreg,%dstreg)
+dnl
+dnl  Emit a cmov instruction, using a .byte sequence, since various past
+dnl  versions of gas don't know cmov.  For example,
+dnl
+dnl         cmovz(  %eax, %ebx)
+dnl
+dnl  The source operand can only be a plain register.  (m4 code implementing
+dnl  full memory addressing modes exists, believe it or not, but isn't
+dnl  currently needed and isn't included.)
+dnl
+dnl  All the standard conditions are defined.  Attempting to use one without
+dnl  the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl  an error.  This protects against writing something old gas wouldn't
+dnl  understand.
+
+dnl  Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl  Called: define_cmov(cond,tttn)
+dnl  Emit basically define(cmov<cond>,`cmov_internal(<cond>,<tttn>,`$1',`$2')')
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+dnl  Called: cmov_internal(name,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(4)
+`.byte	dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+	C `$1 $3, $4'')
+
+
+dnl  Usage: x86_opcode_regmmx(reg)
+dnl
+dnl  Validate the given mmx register, and return its number, 0 to 7.
+
+define(x86_opcode_regmmx,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regmmx_list)')
+
+define(x86_opcode_regmmx_list,
+``%mm0',0,
+`%mm1',1,
+`%mm2',2,
+`%mm3',3,
+`%mm4',4,
+`%mm5',5,
+`%mm6',6,
+`%mm7',7')
+
+
+dnl  Usage: psadbw(%srcreg,%dstreg)
+dnl
+dnl  Oldish versions of gas don't know psadbw, in particular gas 2.9.1 on
+dnl  FreeBSD 3.3 and 3.4 doesn't, so instead emit .byte sequences.  For
+dnl  example,
+dnl
+dnl         psadbw( %mm1, %mm2)
+dnl
+dnl  Only register->register forms are supported here, which suffices for
+dnl  the current code.
+
+define(psadbw,
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`.byte 0x0f,0xf6,dnl
+eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1')) dnl
+	C `psadbw $1, $2'')
+
+
+dnl  Usage: Zdisp(inst,op,op,op)
+dnl
+dnl  Generate explicit .byte sequences if necessary to force a byte-sized
+dnl  zero displacement on an instruction.  For example,
+dnl
+dnl         Zdisp(  movl,   0,(%esi), %eax)
+dnl
+dnl  expands to
+dnl
+dnl                 .byte   139,70,0  C movl 0(%esi), %eax
+dnl
+dnl  If the displacement given isn't 0, then normal assembler code is
+dnl  generated.  For example,
+dnl
+dnl         Zdisp(  movl,   4,(%esi), %eax)
+dnl
+dnl  expands to
+dnl
+dnl                 movl    4(%esi), %eax
+dnl
+dnl  This means a single Zdisp() form can be used with an expression for the
+dnl  displacement, and .byte will be used only if necessary.  The
+dnl  displacement argument is eval()ed.
+dnl
+dnl  Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl  implemented with a table of instructions and encodings.  A new entry is
+dnl  needed for any different operation or registers.  The table is split
+dnl  into separate macros to avoid overflowing BSD m4 macro expansion space.
+
+define(Zdisp,
+m4_assert_numargs(4)
+`define(`Zdisp_found',0)dnl
+Zdisp_1($@)dnl
+Zdisp_2($@)dnl
+Zdisp_3($@)dnl
+Zdisp_4($@)dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_1,`dnl
+Zdisp_match( adcl, 0,(%edx), %eax,        `0x13,0x42,0x00',           $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %ebx,        `0x13,0x5a,0x00',           $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi,        `0x13,0x72,0x00',           $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi),        `0x01,0x5f,0x00',           $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi),        `0x01,0x4f,0x00',           $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi),        `0x01,0x77,0x00',           $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %eax,        `0x1b,0x42,0x00',           $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi,        `0x1b,0x72,0x00',           $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi),        `0x29,0x4f,0x00',           $@)`'dnl
+Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl
+Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl
+Zdisp_match( adc, 0,(%ebx,%ecx,4), %eax,  `0x13,0x44,0x8b,0x00',      $@)`'dnl
+Zdisp_match( sbb, 0,(%ebx,%ecx,4), %eax,  `0x1b,0x44,0x8b,0x00',      $@)`'dnl
+')
+define(Zdisp_2,`dnl
+Zdisp_match( movl, %eax, 0,(%edi),        `0x89,0x47,0x00',           $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi),        `0x89,0x5f,0x00',           $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi),        `0x89,0x77,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax,        `0x8b,0x43,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi,        `0x8b,0x73,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%edx), %eax,        `0x8b,0x42,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax,        `0x8b,0x46,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00',      $@)`'dnl
+Zdisp_match( mov, 0,(%esi,%ecx,4), %eax,  `0x8b,0x44,0x8e,0x00',      $@)`'dnl
+Zdisp_match( mov, %eax, 0,(%edi,%ecx,4),  `0x89,0x44,0x8f,0x00',      $@)`'dnl
+')
+define(Zdisp_3,`dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%edx), %mm0,        `0x0f,0x6f,0x42,0x00',      $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0,        `0x0f,0x6f,0x46,0x00',      $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi),        `0x0f,0x7f,0x47,0x00',      $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+')
+define(Zdisp_4,`dnl
+Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl
+')
+
+define(Zdisp_match,
+m4_assert_numargs(9)
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+	&& m4_stringequal_p(`$2',0)
+	&& m4_stringequal_p(`$3',`$8')
+	&& m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+`	.byte	$5  C `$1 0$3, $4'',
+`	$6	$7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+	&& m4_stringequal_p(`$2',`$7')
+	&& m4_stringequal_p(`$3',0)
+	&& m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+`	.byte	$5  C `$1 $2, 0$4'',
+`	$6	$7, $8$9')')')')
+
+
+dnl  Usage: shldl(count,src,dst)
+dnl         shrdl(count,src,dst)
+dnl         shldw(count,src,dst)
+dnl         shrdw(count,src,dst)
+dnl
+dnl  Generate a double-shift instruction, possibly omitting a %cl count
+dnl  parameter if that's what the assembler requires, as indicated by
+dnl  WANT_SHLDL_CL in config.m4.  For example,
+dnl
+dnl         shldl(  %cl, %eax, %ebx)
+dnl
+dnl  turns into either
+dnl
+dnl         shldl   %cl, %eax, %ebx
+dnl  or
+dnl         shldl   %eax, %ebx
+dnl
+dnl  Immediate counts are always passed through unchanged.  For example,
+dnl
+dnl         shrdl(  $2, %esi, %edi)
+dnl  becomes
+dnl         shrdl   $2, %esi, %edi
+dnl
+dnl
+dnl  If you forget to use the macro form "shldl( ...)" and instead write
+dnl  just a plain "shldl ...", an error results.  This ensures the necessary
+dnl  variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+m4_assert_numargs(1)
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl  Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl  Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1'	`$3', `$4'',
+``$1'	`$2', `$3', `$4'')')
+
+
+dnl  Usage: ASSERT([cond][,instructions])
+dnl
+dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl  flags condition to then be satisfied.  For example,
+dnl
+dnl         ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl  The instructions can be omitted to just assert a flags condition with
+dnl  no extra calculation.  For example,
+dnl
+dnl         ASSERT(nc)
+dnl
+dnl  When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl  flags, but the instructions themselves must preserve any registers that
+dnl  matter.  FRAME is adjusted for the push and pop, so the instructions
+dnl  given can use defframe() stack variables.
+dnl
+dnl  The condition can be omitted to just output the given instructions when
+dnl  assertion checking is wanted.  In this case the pushf/popf is omitted.
+dnl  For example,
+dnl
+dnl         ASSERT(, `movl %eax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+	`$2',
+	`C ASSERT
+ifelse(`$2',,,`	pushf	ifdef(`FRAME',`FRAME_pushl()')')
+	$2
+	j`$1'	L(ASSERT_ok`'ASSERT_counter)
+	ud2	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,`	popf	ifdef(`FRAME',`FRAME_popl()')')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+
+dnl  Usage: movl_text_address(label,register)
+dnl
+dnl  Get the address of a text segment label, using either a plain movl or a
+dnl  position-independent calculation, as necessary.  For example,
+dnl
+dnl         movl_text_address(L(foo),%eax)
+dnl
+dnl  This macro is only meant for use in ASSERT()s or when testing, since
+dnl  the PIC sequence it generates will want to be done with a ret balancing
+dnl  the call on CPUs with return address branch prediction.
+dnl
+dnl  The addl generated here has a backward reference to the label, and so
+dnl  won't suffer from the two forwards references bug in old gas (described
+dnl  in mpn/x86/README).
+
+define(movl_text_address,
+m4_assert_numargs(2)
+`ifdef(`PIC',
+	`call	L(movl_text_address_`'movl_text_address_counter)
+L(movl_text_address_`'movl_text_address_counter):
+	popl	$2	C %eip
+	addl	`$'$1-L(movl_text_address_`'movl_text_address_counter), $2
+define(`movl_text_address_counter',incr(movl_text_address_counter))',
+	`movl	`$'$1, $2')')
+
+define(movl_text_address_counter,1)
+
+
+dnl  Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
+dnl
+dnl  Expand to either "notl `reg'" or "xorl $GMP_NUMB_MASK,`reg'" as
+dnl  appropriate for nails in use or not.
+
+define(notl_or_xorl_GMP_NUMB_MASK,
+m4_assert_numargs(1)
+`ifelse(GMP_NAIL_BITS,0,
+`notl	`$1'',
+`xorl	$GMP_NUMB_MASK, `$1'')')
+
+
+dnl  Usage LEA(symbol,reg)
+dnl  Usage LEAL(symbol_local_to_file,reg)
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	movl	$1@GOT($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	leal	$1@GOTOFF($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip')
+
+define(`load_eip', `')		dnl updated in LEA/LEAL
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+	`RODATA
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	SIZE(`$1',.-`$1')')
+
+dnl  Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`ifdef(`PIC',
+  `call	GSYM_PREFIX`'$1@PLT',
+  `call	GSYM_PREFIX`'$1')')
+
+ifdef(`PIC',
+`define(`PIC_WITH_EBX')',
+`undefine(`PIC_WITH_EBX')')
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86/zn1/gmp-mparam.h b/third_party/gmp/mpn/x86/zn1/gmp-mparam.h
new file mode 100644
index 0000000..8e6c052
--- /dev/null
+++ b/third_party/gmp/mpn/x86/zn1/gmp-mparam.h
@@ -0,0 +1,220 @@
+/* AMD zn1/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 14.00% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             248
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                91
+#define MUL_TOOM44_THRESHOLD               137
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 63
+#define SQR_TOOM3_THRESHOLD                 98
+#define SQR_TOOM4_THRESHOLD                172
+#define SQR_TOOM6_THRESHOLD                286
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             64
+
+#define MULMOD_BNM1_THRESHOLD               21
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543, 8}, {   1087,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671, 9}, {   1343,11}, {    351,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,10}, {   1919,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,12}, {   7935,11}, {  15871,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7935,12}, {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             464  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    464, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,11}, {    159, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,12}, {   7679,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 173
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  60
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 161
+#define SQRLO_SQR_THRESHOLD               9335
+
+#define DC_DIV_QR_THRESHOLD                 71
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                126
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD               274
+#define INV_APPR_THRESHOLD                 228
+
+#define BINV_NEWTON_THRESHOLD              274
+#define REDC_1_TO_REDC_N_THRESHOLD          71
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define POWM_SEC_TABLE  3,28,54,386,1337
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               262
+#define SET_STR_PRECOMPUTE_THRESHOLD       558
+
+#define FAC_DSC_THRESHOLD                  109
+#define FAC_ODD_THRESHOLD                   39
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    1  /* 7.49% faster than 3 */
+#define HGCD_THRESHOLD                      74
+#define HGCD_APPR_THRESHOLD                 70
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   440
+#define GCDEXT_DC_THRESHOLD                327
+#define JACOBI_BASE_METHOD                   1  /* 11.98% faster than 3 */
+
+/* Tuneup completed successfully, took 36916 seconds */
diff --git a/third_party/gmp/mpn/x86/zn2/gmp-mparam.h b/third_party/gmp/mpn/x86/zn2/gmp-mparam.h
new file mode 100644
index 0000000..152e6b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/zn2/gmp-mparam.h
@@ -0,0 +1,226 @@
+/* AMD zn2/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4400 MHz Matisse */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 4.78% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               7
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           23
+
+#define DIV_1_VS_MUL_1_PERCENT             274
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               290
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                153
+#define SQR_TOOM4_THRESHOLD                214
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             48
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               24
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671, 8}, {   1343,10}, {    351, 9}, {    703,10}, \
+    {    367, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671, 9}, \
+    {   1343,11}, {    351,10}, {    735,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,11}, \
+    {    447,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471, 9}, \
+    {   2943,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,10}, {   2943,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,10}, {   3839,14}, \
+    {    255,13}, {    511,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,10}, {   5887,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3839,12}, {   7679,13}, {   3967,12}, {   7935,11}, \
+    {  15871,15}, {   1023,14}, {   2047,13}, {   4351,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,13}, {   7935,12}, \
+    {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 189
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     95,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511, 9}, {    271,10}, {    143, 9}, \
+    {    287, 8}, {    607, 7}, {   1215,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543, 8}, {   1087, 9}, {    607, 8}, \
+    {   1215,11}, {    159, 9}, {    671, 8}, {   1343,10}, \
+    {    351, 9}, {    735, 8}, {   1471,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,10}, \
+    {    607, 9}, {   1215, 8}, {   2431,10}, {    671, 9}, \
+    {   1343,10}, {    735, 9}, {   1471,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215, 9}, {   2431,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471, 9}, {   2943,12}, \
+    {    383,11}, {    863,12}, {    447,11}, {    959,10}, \
+    {   1919,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,10}, {   2943, 9}, \
+    {   5887,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1919,10}, {   3839,14}, \
+    {    255,13}, {    511,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,10}, {   5887,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,11}, {   5887,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {    511,14}, {   1023,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3839,12}, {   7679,13}, {   3967,12}, \
+    {   7935,11}, {  15871,15}, {   1023,14}, {   2047,13}, \
+    {   4223,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,12}, {  15871,16} }
+#define SQR_FFT_TABLE3_SIZE 178
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 107
+#define SQRLO_SQR_THRESHOLD               6633
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               212
+#define INV_APPR_THRESHOLD                 204
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  1,16,96,386,1555
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               303
+#define SET_STR_PRECOMPUTE_THRESHOLD       748
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   55
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    1  /* 14.03% faster than 3 */
+#define HGCD_THRESHOLD                     103
+#define HGCD_APPR_THRESHOLD                127
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   396
+#define GCDEXT_DC_THRESHOLD                265
+#define JACOBI_BASE_METHOD                   1  /* 47.88% faster than 4 */
+
+/* Tuneup completed successfully, took 29014 seconds */