Add libgmp 6.2.0 to third_party

Don't build it yet.  That will come in the next review.

Change-Id: Idf3266558165e5ab45f4a41c98cc8c838c8244d5
diff --git a/third_party/gmp/mpn/pa64/README b/third_party/gmp/mpn/pa64/README
new file mode 100644
index 0000000..a51ce02
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/README
@@ -0,0 +1,78 @@
+Copyright 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+PIPELINE SUMMARY
+
+The PA8x00 processors have an orthogonal 4-way out-of-order pipeline.  Each
+cycle two ALU operations and two MEM operations can issue, but just one of the
+MEM operations may be a store.  The two ALU operations can be almost any
+combination of non-memory operations.  Unlike on every other processor,
+integer and fp operations are treated completely equally here; both simply
+count as ALU operations.
+
+Unfortunately, some operations cause hiccups in the pipeline.  Combining
+carry-consuming operations like ADD,DC with operations that do not set carry,
+like ADD,L, causes long delays.  Skip operations also seem to cause hiccups.
+If several ADD,DC are issued consecutively, or if a plain carry-generating ADD
+feeds ADD,DC, stalling does not occur.  We can effectively issue two ADD,DC
+operations/cycle.
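+
+Schematically, a carry chain of the following form (operand names here are
+just placeholders) issues without stalling, since carry keeps flowing from
+one instruction to the next:
+
+	add	u0, v0, s0	C plain ADD sets the carry bit
+	add,dc	u1, v1, s1	C ADD,DC consumes and regenerates carry
+	add,dc	u2, v2, s2	C back-to-back ADD,DC, no stall
+
+whereas an ADD,L (which leaves carry untouched) ahead of an ADD,DC incurs the
+delay described above.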
+
+Latency scheduling is not as important as making sure to have a mix of ALU and
+MEM operations, but for full pipeline utilization, it is still a good idea to
+do some amount of latency scheduling.
+
+As on all other processors, RAW memory scheduling is critically important.
+Since integer multiplication takes place in the floating-point unit, the GMP
+code needs to handle this problem frequently.
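+
+The code in this directory therefore forms each partial product with XMPYU in
+the FPU, spills it to a stack slot with FSTD, and reloads it with LDD well
+before it is consumed, roughly as follows:
+
+	xmpyu	%fr8R, %fr4L, %fr22	C 32x32 -> 64 partial product in the FPU
+	fstd	%fr22, -0x78(%r30)	C spill it to a stack temporary
+	C ... other, independent work hides the store-to-load latency ...
+	ldd	-0x78(%r30), %r1	C reload into an integer register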
+
+STATUS
+
+* mpn_lshift and mpn_rshift run at 1.5 cycles/limb on PA8000 and at 1.0
+  cycles/limb on PA8500.  With latency scheduling, the numbers could
+  probably be improved to 1.0 cycles/limb for all PA8x00 chips.
+
+* mpn_add_n and mpn_sub_n run at 2.0 cycles/limb on PA8000 and at about
+  1.6875 cycles/limb on PA8500.  With latency scheduling, this could
+  probably be improved to get close to 1.5 cycles/limb.  A problem is the
+  stalling of carry-inputting instructions after instructions that do not
+  write to carry.
+
+* mpn_mul_1, mpn_addmul_1, and mpn_submul_1 run at between 5.625 and 6.375
+  cycles/limb on PA8500 and later, and about a cycle/limb slower on older
+  chips.  The code uses ADD,DC for adjacent limbs, and relies heavily on
+  reordering.
+
+
+REFERENCES
+
+Hewlett Packard, "64-Bit Runtime Architecture for PA-RISC 2.0", version 3.3,
+October 1997.
diff --git a/third_party/gmp/mpn/pa64/addmul_1.asm b/third_party/gmp/mpn/pa64/addmul_1.asm
new file mode 100644
index 0000000..2cb9af9
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/addmul_1.asm
@@ -0,0 +1,693 @@
+dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl  add the result to a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		7
+C 8500,8600,8700:	6.375
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline, and
+C  a lot of stack slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_addmul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+define(`r000',`%r3')	C
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	add		r000, s000, s000
+C	add,dc		%r0, climb, climb
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	ldd		0(rp), r000
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+			C
+define(`r000',`%r1')	C
+define(`r064',`%r19')	C
+define(`r128',`%r20')	C
+define(`r192',`%r21')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		0(up), %fr4
+	add,dc		ma192, s192, s192	C accum mid 3
+	fldd		8(up), %fr5
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	add		r000, s000, s000	C accum rlimb 0
+	fldd		24(up), %fr7
+
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	add		r000, s000, s000	C accum rlimb 0
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	add		r000, s000, s000	C accum rlimb 0
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/pa64/aors_n.asm b/third_party/gmp/mpn/pa64/aors_n.asm
new file mode 100644
index 0000000..ab4536f
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/aors_n.asm
@@ -0,0 +1,130 @@
+dnl  HP-PA 2.0 mpn_add_n, mpn_sub_n
+
+dnl  Copyright 1997, 2000, 2002, 2003, 2009, 2010 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500.  It
+dnl  should be possible to reach the cache bandwidth 1.5 cycles/limb at least
+dnl  with PA8500.  The problem now is stalling of the first ADD,DC after LDO,
+dnl  where the processor gets confused about where carry comes from.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBC,	      `add,dc')
+	define(INITCY,	      `addi -1,%r22,%r0')
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBC,	      `sub,db')
+	define(INITCY,	      `subi 0,%r22,%r0')
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(func_nc)
+ifdef(`HAVE_ABI_2_0w',
+`	b		L(com)
+	nop
+',`	b		L(com)
+	ldw		-52(%r30), %r22
+')
+EPILOGUE()
+PROLOGUE(func)
+	ldi		0, %r22
+LDEF(com)
+	sub		%r0, n, %r21
+	depw,z		%r21, 30, 3, %r28	C r28 = 2 * (-n & 7)
+	depw,z		%r21, 28, 3, %r21	C r21 = 8 * (-n & 7)
+	sub		up, %r21, up		C offset up
+	sub		vp, %r21, vp		C offset vp
+	sub		rp, %r21, rp		C offset rp
+	blr		%r28, %r0		C branch into loop
+	INITCY
+
+LDEF(loop)
+	ldd		0(up), %r20
+	ldd		0(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 0(rp)
+LDEF(7)	ldd		8(up), %r21
+	ldd		8(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 8(rp)
+LDEF(6)	ldd		16(up), %r20
+	ldd		16(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 16(rp)
+LDEF(5)	ldd		24(up), %r21
+	ldd		24(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 24(rp)
+LDEF(4)	ldd		32(up), %r20
+	ldd		32(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 32(rp)
+LDEF(3)	ldd		40(up), %r21
+	ldd		40(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 40(rp)
+LDEF(2)	ldd		48(up), %r20
+	ldd		48(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 48(rp)
+LDEF(1)	ldd		56(up), %r21
+	ldd		56(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	ldo		64(up), up
+	std		%r21, 56(rp)
+	ldo		64(vp), vp
+	addib,>		-8, n, L(loop)
+	ldo		64(rp), rp
+
+	add,dc		%r0, %r0, %r29
+ifdef(`OPERATION_sub_n',`
+	subi		1, %r29, %r29
+')
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29, %r28
+',`	ldi		0, %r28
+')
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa64/aorslsh1_n.asm b/third_party/gmp/mpn/pa64/aorslsh1_n.asm
new file mode 100644
index 0000000..2a55dde
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/aorslsh1_n.asm
@@ -0,0 +1,228 @@
+dnl  PA64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		2
+C 8500,8600,8700:	1.75
+
+C TODO
+C  * Write special feed-in code for each (n mod 8). (See the ia64 code.)
+C  * Try to make this run at closer to 1.5 c/l.
+C  * Set up register aliases (define(`u0',`%r19')).
+C  * Explicitly align loop.
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADCSBC,	`add,dc')
+  define(INITC,		`ldi	0,')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADCSBC,	`sub,db')
+  define(INITC,		`ldi	1,')
+  define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ifdef(`HAVE_ABI_2_0w',`
+  define(LEVEL,		`.level 2.0w')
+  define(RETREG,	`%r28')
+  define(CLRRET1,	`dnl')
+')
+ifdef(`HAVE_ABI_2_0n',`
+  define(LEVEL,		`.level 2.0')
+  define(RETREG,	`%r29')
+  define(CLRRET1,	`ldi	0, %r28')
+')
+
+	LEVEL
+PROLOGUE(func)
+	std,ma		%r3, 0x100(%r30)	C save reg
+
+	INITC		%r1			C init saved cy
+
+C Primitive code for the first (n mod 8) limbs:
+	extrd,u		n, 63, 3, %r22		C count for loop0
+	comib,=		0, %r22, L(unrolled)	C skip loop0?
+	copy		%r0, %r28
+LDEF(loop0)
+	ldd	0(vp), %r21
+	ldo		8(vp), vp
+	ldd	0(up), %r19
+	ldo		8(up), up
+	shrpd	%r21, %r28, 63, %r31
+	addi		-1, %r1, %r0		C restore cy
+	ADCSBC	%r19, %r31, %r29
+	std	%r29, 0(rp)
+	add,dc		%r0, %r0, %r1		C save cy
+	copy	%r21, %r28
+	addib,>		-1, %r22, L(loop0)
+	ldo		8(rp), rp
+
+	addib,>=	-8, n, L(unrolled)
+	addi		-1, %r1, %r0		C restore cy
+
+	shrpd	%r0, %r28, 63, %r28
+	ADCSBC	%r0, %r28, RETREG
+ifdef(`OPERATION_sublsh1_n',
+`	sub	%r0, RETREG, RETREG')
+	CLRRET1
+
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+
+
+LDEF(unrolled)
+	std		%r4, -0xf8(%r30)	C save reg
+	ldd	0(vp), %r4
+	std		%r5, -0xf0(%r30)	C save reg
+	ldd	8(vp), %r5
+	std		%r6, -0xe8(%r30)	C save reg
+	ldd	16(vp), %r6
+	std		%r7, -0xe0(%r30)	C save reg
+
+	ldd	24(vp), %r7
+	shrpd	%r4, %r28, 63, %r31
+	std		%r8, -0xd8(%r30)	C save reg
+	ldd	32(vp), %r8
+	shrpd	%r5, %r4, 63, %r4
+	std		%r9, -0xd0(%r30)	C save reg
+	ldd	40(vp), %r9
+	shrpd	%r6, %r5, 63, %r5
+	ldd	48(vp), %r3
+	shrpd	%r7, %r6, 63, %r6
+	ldd	56(vp), %r28
+	shrpd	%r8, %r7, 63, %r7
+	ldd	0(up), %r19
+	shrpd	%r9, %r8, 63, %r8
+	ldd	8(up), %r20
+	shrpd	%r3, %r9, 63, %r9
+	ldd	16(up), %r21
+	shrpd	%r28, %r3, 63, %r3
+	ldd	24(up), %r22
+
+	nop					C alignment FIXME
+	addib,<=	-8, n, L(end)
+	addi		-1, %r1, %r0		C restore cy
+LDEF(loop)
+	ADCSBC	%r19, %r31, %r29
+	ldd	32(up), %r19
+	std	%r29, 0(rp)
+	ADCSBC	%r20, %r4, %r29
+	ldd	40(up), %r20
+	std	%r29, 8(rp)
+	ADCSBC	%r21, %r5, %r29
+	ldd	48(up), %r21
+	std	%r29, 16(rp)
+	ADCSBC	%r22, %r6, %r29
+	ldd	56(up), %r22
+	std	%r29, 24(rp)
+	ADCSBC	%r19, %r7, %r29
+	ldd	64(vp), %r4
+	std	%r29, 32(rp)
+	ADCSBC	%r20, %r8, %r29
+	ldd	72(vp), %r5
+	std	%r29, 40(rp)
+	ADCSBC	%r21, %r9, %r29
+	ldd	80(vp), %r6
+	std	%r29, 48(rp)
+	ADCSBC	%r22, %r3, %r29
+	std	%r29, 56(rp)
+
+	add,dc		%r0, %r0, %r1		C save cy
+
+	ldd	88(vp), %r7
+	shrpd	%r4, %r28, 63, %r31
+	ldd	96(vp), %r8
+	shrpd	%r5, %r4, 63, %r4
+	ldd	104(vp), %r9
+	shrpd	%r6, %r5, 63, %r5
+	ldd	112(vp), %r3
+	shrpd	%r7, %r6, 63, %r6
+	ldd	120(vp), %r28
+	shrpd	%r8, %r7, 63, %r7
+	ldd	64(up), %r19
+	shrpd	%r9, %r8, 63, %r8
+	ldd	72(up), %r20
+	shrpd	%r3, %r9, 63, %r9
+	ldd	80(up), %r21
+	shrpd	%r28, %r3, 63, %r3
+	ldd	88(up), %r22
+
+	ldo		64(vp), vp
+	ldo		64(rp), rp
+	ldo		64(up), up
+	addib,>		-8, n, L(loop)
+	addi		-1, %r1, %r0		C restore cy
+LDEF(end)
+	ADCSBC	%r19, %r31, %r29
+	ldd	32(up), %r19
+	std	%r29, 0(rp)
+	ADCSBC	%r20, %r4, %r29
+	ldd	40(up), %r20
+	std	%r29, 8(rp)
+	ADCSBC	%r21, %r5, %r29
+	ldd	48(up), %r21
+	std	%r29, 16(rp)
+	ADCSBC	%r22, %r6, %r29
+	ldd	56(up), %r22
+	std	%r29, 24(rp)
+	ADCSBC	%r19, %r7, %r29
+	ldd		-0xf8(%r30), %r4	C restore reg
+	std	%r29, 32(rp)
+	ADCSBC	%r20, %r8, %r29
+	ldd		-0xf0(%r30), %r5	C restore reg
+	std	%r29, 40(rp)
+	ADCSBC	%r21, %r9, %r29
+	ldd		-0xe8(%r30), %r6	C restore reg
+	std	%r29, 48(rp)
+	ADCSBC	%r22, %r3, %r29
+	ldd		-0xe0(%r30), %r7	C restore reg
+	std	%r29, 56(rp)
+
+	shrpd	%r0, %r28, 63, %r28
+	ldd		-0xd8(%r30), %r8	C restore reg
+	ADCSBC	%r0, %r28, RETREG
+ifdef(`OPERATION_sublsh1_n',
+`	sub	%r0, RETREG, RETREG')
+	CLRRET1
+
+	ldd		-0xd0(%r30), %r9	C restore reg
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3	C restore reg
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa64/gmp-mparam.h b/third_party/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000..c2719c3
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,247 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 440MHz PA8200 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD              21
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                31
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               179
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               296
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     130
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     229
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     129
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      54
+
+#define SQR_BASECASE_THRESHOLD               5
+#define SQR_TOOM2_THRESHOLD                 58
+#define SQR_TOOM3_THRESHOLD                153
+#define SQR_TOOM4_THRESHOLD                278
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  2,23,228,1084
+
+#define MUL_FFT_MODF_THRESHOLD             336  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    336, 5}, {     11, 4}, {     23, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47, 9}, {     95,10}, {     55,11}, {     31,10}, \
+    {     63, 9}, {    127,10}, {     71, 8}, {    287,10}, \
+    {     79,11}, {     47,10}, {     95, 9}, {    191, 8}, \
+    {    383, 7}, {    767,10}, {    103, 9}, {    207, 8}, \
+    {    415, 7}, {    831,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    543, 7}, {   1087, 6}, \
+    {   2175,10}, {    143, 9}, {    287, 8}, {    575,11}, \
+    {     79, 9}, {    319, 8}, {    639, 7}, {   1279, 9}, \
+    {    335, 8}, {    671,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831, 7}, \
+    {   1663,11}, {    111,10}, {    223, 9}, {    447, 8}, \
+    {    895,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    543, 8}, {   1087, 7}, {   2175,10}, {    287, 9}, \
+    {    575, 8}, {   1215, 7}, {   2431,10}, {    319, 9}, \
+    {    639, 8}, {   1279,10}, {    335, 9}, {    671, 8}, \
+    {   1343, 9}, {    703, 8}, {   1407,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207, 9}, {    831, 8}, \
+    {   1663,11}, {    223,10}, {    447, 9}, {    959,13}, \
+    {     63,12}, {    127,11}, {    255, 8}, {   2047,11}, \
+    {    271,10}, {    543, 9}, {   1087, 8}, {   2175,11}, \
+    {    287,10}, {    575, 9}, {   1215, 8}, {   2431,11}, \
+    {    319,10}, {    671, 9}, {   1343, 8}, {   2687,11}, \
+    {    351,10}, {    703, 9}, {   1471, 8}, {   2943,12}, \
+    {    191,11}, {    383, 8}, {   3071,11}, {    415,10}, \
+    {    831, 9}, {   1663,11}, {    479,10}, {    959, 9}, \
+    {   1919, 8}, {   3839,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087, 9}, {   2175,12}, {    287,11}, \
+    {    607,10}, {   1215, 9}, {   2431, 8}, {   4863,12}, \
+    {    319,11}, {    671,10}, {   1343,13}, {    191, 9}, \
+    {   3071,12}, {    415,11}, {    831,10}, {   1663, 8}, \
+    {   6655, 9}, {   3455,12}, {    447, 9}, {   3583,13}, \
+    {    255,12}, {    511,11}, {   1023,10}, {   2175,13}, \
+    {    319,11}, {   1279,12}, {    671,10}, {   2815,12}, \
+    {    735,10}, {   2943, 9}, {   5887,13}, {    383,12}, \
+    {    767,11}, {   1535,10}, {   3071,13}, {    447,10}, \
+    {   3583,12}, {    959,13}, {    511,12}, {   1087,13}, \
+    {    639,12}, {   1343,13}, {    767,11}, {   3071,13}, \
+    {    831,12}, {   1663,11}, {   3455,10}, {   6911,13}, \
+    {    895,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2303,13}, {   1215,12}, {   2431,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1343,12}, \
+    {   2687,11}, {   5375,13}, {   1407,12}, {   2815,11}, \
+    {   5631,12}, {   2943,13}, {   1535,12}, {   3199,13}, \
+    {   1663,12}, {   3327,13}, {   1727,14}, {    895,13}, \
+    {   1791,12}, {   3583,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,14}, {   1151,13}, \
+    {   2431,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,12}, {   5631,15}, {    767,14}, {   1535,13}, \
+    {   3071,14}, {   1663,13}, {   3327,14}, {   1791,13}, \
+    {   3583,14}, {   1919,15}, {   1023,14}, {   2303,13}, \
+    {   4607,14}, {   2431,13}, {   4863,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 252
+#define MUL_FFT_THRESHOLD                 2368
+
+#define SQR_FFT_MODF_THRESHOLD             284  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    284, 5}, {      9, 4}, {     21, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     25, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 8}, {    255, 7}, {    511,10}, \
+    {     71, 8}, {    287, 7}, {    575,10}, {     79,11}, \
+    {     47,10}, {     95, 9}, {    191, 8}, {    383, 7}, \
+    {    767,10}, {    103, 9}, {    207, 8}, {    415,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    543, 7}, {   1087, 8}, {    575, 7}, {   1151,11}, \
+    {     79, 8}, {    639, 7}, {   1279, 9}, {    335, 8}, \
+    {    671, 7}, {   1343,10}, {    175, 8}, {    703, 7}, \
+    {   1407,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831, 7}, \
+    {   1663, 9}, {    447, 8}, {    895,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    543, 8}, {   1087, 7}, \
+    {   2175, 9}, {    575, 8}, {   1151,10}, {    303, 9}, \
+    {    607, 8}, {   1215, 7}, {   2431,10}, {    319, 9}, \
+    {    639, 8}, {   1279, 9}, {    671, 8}, {   1343, 7}, \
+    {   2687,10}, {    351, 9}, {    703, 8}, {   1407,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831, 8}, {   1663,11}, \
+    {    223,10}, {    447, 9}, {    895,13}, {     63,11}, \
+    {    255,10}, {    543, 8}, {   2175,11}, {    287,10}, \
+    {    575, 9}, {   1151,10}, {    607, 9}, {   1215, 8}, \
+    {   2431,11}, {    319, 9}, {   1279,10}, {    671, 9}, \
+    {   1343, 8}, {   2687,11}, {    351,10}, {    703, 9}, \
+    {   1407,10}, {    735,12}, {    191,11}, {    383,10}, \
+    {    831, 9}, {   1663,12}, {    223,11}, {    447,10}, \
+    {    895,11}, {    479, 9}, {   1919, 8}, {   3839,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087, 9}, {   2175,12}, {    287,11}, {    575,10}, \
+    {   1151,11}, {    607,10}, {   1215, 9}, {   2431, 8}, \
+    {   4863,10}, {   1279,11}, {    671,10}, {   1343, 9}, \
+    {   2687,12}, {    351,11}, {    703,10}, {   1407,11}, \
+    {    735,13}, {    191, 9}, {   3071, 7}, {  12287,11}, \
+    {    799,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447, 8}, {   7167,12}, {    479, 9}, {   3839,14}, \
+    {    127,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,10}, {   2175, 9}, {   4607,11}, {   1215,10}, \
+    {   2431,11}, {   1279,10}, {   2559,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,10}, {   3199, 9}, \
+    {   6399,12}, {    895,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,13}, {    575,12}, {   1151,10}, \
+    {   4607,13}, {    639,12}, {   1279,11}, {   2687,14}, \
+    {    383,13}, {    767,11}, {   3071,12}, {   1599,13}, \
+    {    895,12}, {   1791,11}, {   3583,13}, {    959,15}, \
+    {    255,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1279,12}, {   2559,13}, {   1343,12}, {   2687,13}, \
+    {   1471,11}, {   5887,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   1599,12}, {   3199,13}, {   1663,12}, \
+    {   3327,13}, {   1727,14}, {    895,13}, {   1791,12}, \
+    {   3583,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,12}, {   4607,13}, {   2431,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2815,15}, {    767,13}, \
+    {   3199,14}, {   1663,13}, {   3327,14}, {   1791,13}, \
+    {   3583,14}, {   1919,15}, {   1023,14}, {   2047,13}, \
+    {   4095,14}, {   2303,13}, {   4607,14}, {   2431,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 257
+#define SQR_FFT_THRESHOLD                 1856
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 113
+#define MULLO_MUL_N_THRESHOLD             4658
+
+#define DC_DIV_QR_THRESHOLD                123
+#define DC_DIVAPPR_Q_THRESHOLD             372
+#define DC_BDIV_QR_THRESHOLD               142
+#define DC_BDIV_Q_THRESHOLD                312
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD               315
+#define INV_APPR_THRESHOLD                 315
+
+#define BINV_NEWTON_THRESHOLD              360
+#define REDC_1_TO_REDC_N_THRESHOLD         101
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               93
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD               1187
+
+#define MATRIX22_STRASSEN_THRESHOLD          9
+#define HGCD_THRESHOLD                     234
+#define HGCD_APPR_THRESHOLD                300
+#define HGCD_REDUCE_THRESHOLD             1553
+#define GCD_DC_THRESHOLD                   684
+#define GCDEXT_DC_THRESHOLD                525
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                21
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD              1951
+#define SET_STR_PRECOMPUTE_THRESHOLD      4034
diff --git a/third_party/gmp/mpn/pa64/lshift.asm b/third_party/gmp/mpn/pa64/lshift.asm
new file mode 100644
index 0000000..c0fc292
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/lshift.asm
@@ -0,0 +1,114 @@
+dnl  HP-PA 2.0 mpn_lshift -- Left shift.
+
+dnl  Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+define(`cnt',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(mpn_lshift)
+	shladd		n, 3, up, up
+	shladd		n, 3, rp, rp
+	subi		64, cnt, cnt
+	mtsar		cnt
+	ldd		-8(up), %r21
+	addib,=		-1, n, L(end)
+	shrpd		%r0, %r21, %sar, %r29	C compute carry out limb
+	depw,z		n, 31, 3, %r28		C r28 = (size & 7)
+	sub		%r0, n, %r22
+	depw,z		%r22, 28, 3, %r22	C r22 = 8 * (-size & 7)
+	add		up, %r22, up		C offset up
+	blr		%r28, %r0		C branch into jump table
+	add		rp, %r22, rp		C offset rp
+	b		L(0)
+	nop
+	b		L(1)
+	copy		%r21, %r20
+	b		L(2)
+	nop
+	b		L(3)
+	copy		%r21, %r20
+	b		L(4)
+	nop
+	b		L(5)
+	copy		%r21, %r20
+	b		L(6)
+	nop
+	b		L(7)
+	copy		%r21, %r20
+
+LDEF(loop)
+LDEF(0)	ldd		-16(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -8(rp)
+LDEF(7)	ldd		-24(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -16(rp)
+LDEF(6)	ldd		-32(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -24(rp)
+LDEF(5)	ldd		-40(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -32(rp)
+LDEF(4)	ldd		-48(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -40(rp)
+LDEF(3)	ldd		-56(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -48(rp)
+LDEF(2)	ldd		-64(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -56(rp)
+LDEF(1)	ldd		-72(up), %r21
+	ldo		-64(up), up
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -64(rp)
+	addib,>		-8, n, L(loop)
+	ldo		-64(rp), rp
+
+LDEF(end)
+	shrpd		%r21, %r0, %sar, %r21
+	std		%r21, -8(rp)
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29,%r28
+',`	extrd,u		%r29, 31, 32, %r28
+')
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/pa64/mul_1.asm b/third_party/gmp/mpn/pa64/mul_1.asm
new file mode 100644
index 0000000..6935c23
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/mul_1.asm
@@ -0,0 +1,646 @@
+dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		6.5
+C 8500,8600,8700:	5.625
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline, and
+C  a lot of stack slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_mul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	add		ma000, s000, s000	C accum mid 0
+	fldd		0(up), %fr4
+	add,dc		ma064, s064, s064	C accum mid 1
+	std		s000, 0(rp)
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		8(up), %fr5
+	add,dc		ma192, s192, s192	C accum mid 3
+	std		s064, 8(rp)
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	fldd		24(up), %fr7
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	add		ma000, s000, s000	C accum mid 0
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	std		s000, 0(rp)
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	add		ma000, s000, s000	C accum mid 0
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	std		s000, 0(rp)
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/pa64/rshift.asm b/third_party/gmp/mpn/pa64/rshift.asm
new file mode 100644
index 0000000..cfc242e
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/rshift.asm
@@ -0,0 +1,111 @@
+dnl  HP-PA 2.0 mpn_rshift -- Right shift.
+
+dnl  Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+define(`cnt',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(mpn_rshift)
+	mtsar		cnt
+	ldd		0(up), %r21
+	addib,=		-1, n, L(end)
+	shrpd		%r21, %r0, %sar, %r29	C compute carry out limb
+	depw,z		n, 31, 3, %r28		C r28 = (size & 7)
+	sub		%r0, n, %r22
+	depw,z		%r22, 28, 3, %r22	C r22 = 8 * (-size & 7)
+	sub		up, %r22, up		C offset up
+	blr		%r28, %r0		C branch into jump table
+	sub		rp, %r22, rp		C offset rp
+	b		L(0)
+	nop
+	b		L(1)
+	copy		%r21, %r20
+	b		L(2)
+	nop
+	b		L(3)
+	copy		%r21, %r20
+	b		L(4)
+	nop
+	b		L(5)
+	copy		%r21, %r20
+	b		L(6)
+	nop
+	b		L(7)
+	copy		%r21, %r20
+
+LDEF(loop)
+LDEF(0)	ldd		8(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 0(rp)
+LDEF(7)	ldd		16(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 8(rp)
+LDEF(6)	ldd		24(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 16(rp)
+LDEF(5)	ldd		32(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 24(rp)
+LDEF(4)	ldd		40(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 32(rp)
+LDEF(3)	ldd		48(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 40(rp)
+LDEF(2)	ldd		56(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 48(rp)
+LDEF(1)	ldd		64(up), %r21
+	ldo		64(up), up
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 56(rp)
+	addib,>		-8, n, L(loop)
+	ldo		64(rp), rp
+
+LDEF(end)
+	shrpd		%r0, %r21, %sar, %r21
+	std		%r21, 0(rp)
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29,%r28
+',`	extrd,u		%r29, 31, 32, %r28
+')
+EPILOGUE(mpn_rshift)
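The unrolled loop above builds each output limb from two adjacent input limbs with SHRPD, entering through the jump table so the residue of n modulo 8 is handled without extra branches. For reference, a plain C sketch of the semantics being implemented, assuming 64-bit limbs and a shift count in 1..63 (ref_rshift is an illustrative name, not part of this import):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Illustrative only: shift the n-limb number at up right by cnt bits,
   store the result at rp, and return the bits shifted out, placed in the
   high end of the return limb (matching the carry-out limb computed with
   SHRPD before the loop). */
static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *up,
                            long n, unsigned cnt)
{
    mp_limb_t retval = up[0] << (64 - cnt);        /* bits that fall off  */
    for (long i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;                  /* top limb zero-fills */
    return retval;
}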
diff --git a/third_party/gmp/mpn/pa64/sqr_diagonal.asm b/third_party/gmp/mpn/pa64/sqr_diagonal.asm
new file mode 100644
index 0000000..f6fadc9
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/sqr_diagonal.asm
@@ -0,0 +1,191 @@
+dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
+dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
+dnl  for optimization.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+define(`p00',`%r28')
+define(`p32',`%r29')
+define(`p64',`%r31')
+define(`t0',`%r19')
+define(`t1',`%r20')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_sqr_diagonal)
+	ldo		128(%r30),%r30
+
+	fldds,ma	8(up),%fr8
+	addib,=		-1,n,L(end1)
+	nop
+	fldds,ma	8(up),%fr4
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	addib,=		-1,n,L(end2)
+	ldo		16(rp),rp
+
+LDEF(loop)
+	fldds,ma	8(up),%fr8		C load next up limb
+	xmpyu		%fr4l,%fr4r,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
+	fstd		%fr5,0(rp)
+	xmpyu		%fr4l,%fr4l,%fr7
+	fstd		%fr7,8(rp)
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00		C accumulate in int regs
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	addib,=		-1,n,L(exit)
+	ldo		16(rp),rp
+
+	fldds,ma	8(up),%fr4
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	addib,<>	-1,n,L(loop)
+	ldo		16(rp),rp
+
+LDEF(end2)
+	xmpyu		%fr4l,%fr4r,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fstd		%fr5,0(rp)
+	xmpyu		%fr4l,%fr4l,%fr7
+	fstd		%fr7,8(rp)
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	ldo		16(rp),rp
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+
+LDEF(exit)
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	ldo		16(rp),rp
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+
+LDEF(end1)
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-128(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldo		16(rp),rp
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+EPILOGUE(mpn_sqr_diagonal)
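Each loop iteration above squares one source limb in the FP unit (low, high, and cross 32x32 products via XMPYU) and then folds the doubled cross term back into the two result limbs with integer adds. The overall effect, as a minimal C sketch assuming 64-bit limbs and a compiler with unsigned __int128 (ref_sqr_diagonal is illustrative, not the exported entry point):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Illustrative only: square each source limb into two result limbs, so rp
   must have room for 2*n limbs; no products between different limbs. */
static void ref_sqr_diagonal(mp_limb_t *rp, const mp_limb_t *up, long n)
{
    for (long i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * up[i];
        rp[2 * i]     = (mp_limb_t) p;           /* low half of up[i]^2  */
        rp[2 * i + 1] = (mp_limb_t) (p >> 64);   /* high half of up[i]^2 */
    }
}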
diff --git a/third_party/gmp/mpn/pa64/submul_1.asm b/third_party/gmp/mpn/pa64/submul_1.asm
new file mode 100644
index 0000000..f8a1968
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/submul_1.asm
@@ -0,0 +1,700 @@
+dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		7
+C 8500,8600,8700:	6.5
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline, and
+C  a lot of stack slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limb
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r8
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_submul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+define(`r000',`%r3')	C
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	sub		r000, s000, s000
+C	sub,db		%r0, climb, climb
+C	sub		%r0, climb, climb
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	ldd		0(rp), r000
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+			C
+define(`r000',`%r1')	C
+define(`r064',`%r19')	C
+define(`r128',`%r20')	C
+define(`r192',`%r21')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		0(up), %fr4
+	add,dc		ma192, s192, s192	C accum mid 3
+	fldd		8(up), %fr5
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	sub		r000, s000, s000	C accum rlimb 0
+	fldd		24(up), %fr7
+
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	sub		r000, s000, s000	C accum rlimb 0
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	sub		r000, s000, s000	C accum rlimb 0
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_submul_1)
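Both the short feed-in code and the 4-way unrolled BIG loop above compute the same thing: subtract up[]*vlimb from rp[] and keep the running borrow in climb, which is returned. A plain C sketch of those semantics, assuming 64-bit limbs and unsigned __int128 (ref_submul_1 is an illustrative helper, not the build's entry point):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Illustrative only: rp[0..n-1] -= up[0..n-1] * v; return the borrow out
   of the top limb (the "climb" the assembly keeps in a register). */
static mp_limb_t ref_submul_1(mp_limb_t *rp, const mp_limb_t *up,
                              long n, mp_limb_t v)
{
    mp_limb_t borrow = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * v;
        mp_limb_t plo = (mp_limb_t) p;
        mp_limb_t phi = (mp_limb_t) (p >> 64);
        mp_limb_t t = plo + borrow;
        phi += (t < plo);                /* carry from adding the borrow in */
        mp_limb_t r = rp[i];
        rp[i] = r - t;
        borrow = phi + (r < t);          /* borrow out of this position     */
    }
    return borrow;
}

The assembly gets the same carry and borrow chains for free from ADD,DC and SUB,DB; as the DESCRIPTION notes, the unrolling exists mainly so those chains stay unbroken across four limbs at a time.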
diff --git a/third_party/gmp/mpn/pa64/udiv.asm b/third_party/gmp/mpn/pa64/udiv.asm
new file mode 100644
index 0000000..1380a85
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/udiv.asm
@@ -0,0 +1,125 @@
+dnl  HP-PA 2.0 64-bit mpn_udiv_qrnnd_r.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This runs at about 280 cycles on both PA8000 and PA8500, corresponding to a
+C bit more than 4 cycles/bit.
+
+C INPUT PARAMETERS
+define(`n1',`%r26')
+define(`n0',`%r25')
+define(`d',`%r24')
+define(`remptr',`%r23')
+
+define(`q',`%r28')
+define(`dn',`%r29')
+
+define(`old_divstep',
+       `add,dc		n0,n0,n0
+	add,dc		n1,n1,n1
+	sub,*<<		n1,d,%r22
+	copy		%r22,n1')
+
+define(`divstep',
+       `add		n0,n0,n0
+	add,dc		n1,n1,n1
+	sub		n1,d,%r1
+	add,dc		q,q,q
+	cmpclr,*<<	n1,d,%r0
+	copy		%r1,n1
+')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_udiv_qrnnd_r)
+ifdef(`HAVE_ABI_2_0n',
+`	depd		%r25,31,32,%r26
+	depd		%r23,31,32,%r24
+	copy		%r24,%r25
+	ldd		-56(%r30),%r24
+	ldw		-60(%r30),%r23
+')
+	ldi		0,q
+	cmpib,*>=	0,d,L(large_divisor)
+	ldi		8,%r31		C setup loop counter
+
+	sub		%r0,d,dn
+LDEF(Loop)
+	divstep divstep divstep divstep divstep divstep divstep divstep
+	addib,<>	-1,%r31,L(Loop)
+	nop
+
+ifdef(`HAVE_ABI_2_0n',
+`	copy		%r28,%r29
+	extrd,u		%r28,31,32,%r28
+')
+	bve		(%r2)
+	std		n1,0(remptr)	C store remainder
+
+LDEF(large_divisor)
+	extrd,u		n0,63,1,%r19	C save lsb of dividend
+	shrpd		n1,n0,1,n0	C n0 = lo(n1n0 >> 1)
+	shrpd		%r0,n1,1,n1	C n1 = hi(n1n0 >> 1)
+	extrd,u		d,63,1,%r20	C save lsb of divisor
+	shrpd		%r0,d,1,d	C d = floor(orig_d / 2)
+	add,l		%r20,d,d	C d = ceil(orig_d / 2)
+
+	sub		%r0,d,dn
+LDEF(Loop2)
+	divstep divstep divstep divstep divstep divstep divstep divstep
+	addib,<>	-1,%r31,L(Loop2)
+	nop
+
+	cmpib,*=	0,%r20,L(even_divisor)
+	shladd		n1,1,%r19,n1	C shift in omitted dividend lsb
+
+	add		d,d,d		C restore orig...
+	sub		d,%r20,d	C ...d value
+	sub		%r0,d,dn	C r21 = -d
+
+	add,*nuv	n1,q,n1		C fix remainder for omitted divisor lsb
+	add,l		n1,dn,n1	C adjust remainder if rem. fix carried
+	add,dc		%r0,q,q		C adjust quotient accordingly
+
+	sub,*<<		n1,d,%r0	C remainder >= divisor?
+	add,l		n1,dn,n1	C adjust remainder
+	add,dc		%r0,q,q		C adjust quotient
+
+LDEF(even_divisor)
+ifdef(`HAVE_ABI_2_0n',
+`	copy		%r28,%r29
+	extrd,u		%r28,31,32,%r28
+')
+	bve		(%r2)
+	std		n1,0(remptr)	C store remainder
+EPILOGUE(mpn_udiv_qrnnd_r)
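The divstep macro above is one step of classical restoring division: double the n1:n0 remainder, pull in the next dividend bit, append a quotient bit, and keep n1 - d when the divisor fits (CMPCLR nullifies the COPY otherwise). Sixty-four such steps produce the quotient limb. Below is a rough C rendering of the plain case, assuming the usual n1 < d precondition so the quotient fits in one limb; it does not reproduce the halved-divisor trick of the large_divisor path, and ref_udiv_qrnnd is an illustrative name:

#include <stdint.h>

/* Illustrative only: divide the 128-bit value n1:n0 by d, returning the
   quotient and storing the remainder through rem. */
static uint64_t ref_udiv_qrnnd(uint64_t n1, uint64_t n0,
                               uint64_t d, uint64_t *rem)
{
    uint64_t q = 0;
    for (int i = 0; i < 64; i++) {
        uint64_t top = n1 >> 63;            /* bit shifted out of the top  */
        n1 = (n1 << 1) | (n0 >> 63);        /* shift remainder left by one */
        n0 <<= 1;
        q <<= 1;
        if (top || n1 >= d) {               /* divisor fits: subtract it   */
            n1 -= d;
            q |= 1;
        }
    }
    *rem = n1;
    return q;
}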
diff --git a/third_party/gmp/mpn/pa64/umul.asm b/third_party/gmp/mpn/pa64/umul.asm
new file mode 100644
index 0000000..bd5a71f
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/umul.asm
@@ -0,0 +1,97 @@
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Optimizations:
+dnl  * Avoid skip instructions
+dnl  * Put carry-generating and carry-consuming insns consecutively
+dnl  * Don't allocate any stack; "home" positions for parameters could be used.
+
+include(`../config.m4')
+
+define(`p0',`%r28')
+define(`p1',`%r29')
+define(`t32',`%r19')
+define(`t0',`%r20')
+define(`t1',`%r21')
+define(`x',`%r22')
+define(`m0',`%r23')
+define(`m1',`%r24')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_umul_ppmm_r)
+	ldo		128(%r30),%r30
+ifdef(`HAVE_ABI_2_0w',
+`	std		%r26,-64(%r30)
+	std		%r25,-56(%r30)
+	copy		%r24,%r31
+',`
+	depd		%r25,31,32,%r26
+	std		%r26,-64(%r30)
+	depd		%r23,31,32,%r24
+	std		%r24,-56(%r30)
+	ldw		-180(%r30),%r31
+')
+
+	fldd		-64(%r30),%fr4
+	fldd		-56(%r30),%fr5
+
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+
+	depdi,z		1,31,1,t32		C t32 = 2^32
+
+	ldd		-128(%r30),p0		C lo = low 64 bit of product
+	ldd		-120(%r30),m0		C m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		C m1 = mid1 64 bit of product
+	ldd		-104(%r30),p1		C hi = high 64 bit of product
+
+	add,l,*nuv	m0,m1,x			C x = m1+m0
+	 add,l		t32,p1,p1		C propagate carry to mid of p1
+	depd,z		x,31,32,t0		C lo32(m1+m0)
+	add		t0,p0,p0
+	extrd,u		x,31,32,t1		C hi32(m1+m0)
+	add,dc		t1,p1,p1
+
+	std		p0,0(%r31)		C store low half of product
+ifdef(`HAVE_ABI_2_0w',
+`	copy		p1,%r28			C return val in %r28
+',`	extrd,u		p1,31,32,%r28		C return val in %r28,%r29
+')
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+EPILOGUE(mpn_umul_ppmm_r)
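The integer tail of the routine reassembles the 128-bit product from the four 32x32-bit partial products that XMPYU leaves on the stack: the two mid products are summed, any carry out of that sum is worth 2^96 and goes into the high word, and the sum's two halves are then added into the low and high product words. The same reconstruction in plain C, assuming unsigned 64-bit operands (ref_umul_ppmm is an illustrative name, not the exported symbol):

#include <stdint.h>

/* Illustrative only: full 64x64 -> 128 bit product via 32x32 pieces. */
static void ref_umul_ppmm(uint64_t u, uint64_t v, uint64_t *hi, uint64_t *lo)
{
    uint64_t ul = (uint32_t) u, uh = u >> 32;
    uint64_t vl = (uint32_t) v, vh = v >> 32;

    uint64_t p0 = ul * vl;                  /* low product              */
    uint64_t m0 = ul * vh;                  /* mid products             */
    uint64_t m1 = uh * vl;
    uint64_t p1 = uh * vh;                  /* high product             */

    uint64_t mid = m0 + m1;                 /* may carry out            */
    if (mid < m0)
        p1 += (uint64_t) 1 << 32;           /* that carry is worth 2^96 */

    *lo = p0 + (mid << 32);
    *hi = p1 + (mid >> 32) + (*lo < p0);    /* carry from the low half  */
}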