Add libgmp 6.2.0 to third_party

Don't build it yet.  That will come in the next review.

Change-Id: Idf3266558165e5ab45f4a41c98cc8c838c8244d5
diff --git a/third_party/gmp/mpn/ia64/README b/third_party/gmp/mpn/ia64/README
new file mode 100644
index 0000000..45c2d63
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/README
@@ -0,0 +1,281 @@
+Copyright 2000-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+                      IA-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for the IA-64 architecture.
+
+
+CODE ORGANIZATION
+
+	mpn/ia64          itanium-2, and generic ia64
+
+The code here has been optimized primarily for Itanium 2.  Very few Itanium 1
+chips were ever sold, and Itanium 2 is more powerful, so the latter is what
+we concentrate on.
+
+
+
+CHIP NOTES
+
+The IA-64 ISA packs instructions three at a time into 128-bit bundles.
+Programmers/compilers need to insert explicit stops `;;' when there are WAW
+or RAW dependencies, with some notable exceptions.  Such "stops" typically
+fall at the end of a bundle, but can be placed between operations within
+some bundle types too.
+
+The Itanium 1 and Itanium 2 implementations can, under ideal conditions,
+execute two bundles per cycle.  Itanium 1 allows 4 of these 6 instructions
+to be integer operations, while Itanium 2 allows all 6 to be integer
+operations.
+
+Taken cloop branches seem to insert a bubble into the pipeline most of the
+time on Itanium 1.
+
+Loads to the fp registers bypass the L1 cache and thus get extremely long
+latencies, 9 cycles on the Itanium 1 and 6 cycles on the Itanium 2.
+
+The software-pipelining support built around the br.ctop instruction causes
+delays, since many issue slots are taken up by instructions with zero
+predicates, and since many extra instructions are needed to set things up.
+These features are clearly designed for code density, not speed.
+
+Misc pipeline limitations (Itanium 1):
+* The getf.sig instruction can only execute in M0.
+* At most four integer instructions/cycle.
+* Nops take up resources like any plain instructions.
+
+Misc pipeline limitations (Itanium 2):
+* The getf.sig instruction can only execute in M0.
+* Nops take up resources like any plain instructions.
+
+
+ASSEMBLY SYNTAX
+
+.align pads with nops in a text segment, but gas 2.14 and earlier
+incorrectly byte-swaps its nop bundle in big endian mode (e.g. hpux), making
+it come out as break instructions.  We use the ALIGN() macro in
+mpn/ia64/ia64-defs.m4 wherever the padding might be executed across.  That
+macro suppresses any .align if the problem is detected by configure.  Lack
+of alignment might hurt performance but will at least be correct.
+
+foo:: to create a global symbol is not accepted by gas.  Use separate
+".global foo" and "foo:" instead.
+
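+That is, instead of "foo::" we write
+
+	.global	foo
+foo:
+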
+.global is the standard global directive.  gas accepts .globl, but hpux "as"
+doesn't.
+
+.proc / .endp generates the appropriate .type and .size information for ELF,
+so the latter directives don't need to be given explicitly.
+
+.pred.rel "mutex"... is standard for annotating predicate register
+relationships.  gas also accepts .pred.rel.mutex, but hpux "as" doesn't.
+
+.pred directives can't be put on a line with a label, like
+".Lfoo: .pred ..."; the HP assembler on HP-UX 11.23 rejects that.
+gas is happy with it, and earlier HP assembler versions seemed OK too.
+
+// is the standard comment sequence, but we prefer "C" since it inhibits m4
+macro expansion.  See comments in ia64-defs.m4.
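+
+For example, in the .asm files here (an illustrative line):
+
+	add	r8 = r14, r15		C  sum low limbs; not m4-expanded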
+
+
+REGISTER USAGE
+
+Special:
+   r0: constant 0
+   r1: global pointer (gp)
+   r8: return value
+   r12: stack pointer (sp)
+   r13: thread pointer (tp)
+Caller-saves: r8-r11 r14-r31 f6-f15 f32-f127
+Caller-saves but rotating: r32-
+
+
+================================================================
+mpn_add_n, mpn_sub_n:
+
+The current code runs at 1.25 c/l on Itanium 2.
+
+================================================================
+mpn_mul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+Using a blocked approach, working from 4 separate places in the operands,
+one could make use of the xma accumulation and approach 1 c/l.
+
+	ldf8 [up]
+	xma.l
+	xma.hu
+	stf8  [wrp]
+
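+(Per limb that is 2 memops at max 2 memops/cycle and 2 fpops at max 2
+fpops/cycle, so both limits give 1 c/l, assuming the 4-cycle xma latency
+is hidden by working the 4 blocks in parallel.)
+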
+================================================================
+mpn_addmul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+It seems possible to use a blocked approach, as with mpn_mul_1.  We should
+read rp[] into integer registers, allowing for just one getf.sig per cycle.
+
+	ld8  [rp]
+	ldf8 [up]
+	xma.l
+	xma.hu
+	getf.sig
+	add+add+cmp+cmp
+	st8  [wrp]
+
+These 10 instructions can be scheduled to approach 1.667 c/l (10 insn at
+max 6 insn/cycle), and with the 4-cycle latency of xma, this means we need
+at least 3 blocks.  Using ldfp8 we could approach 1.583 c/l.
+
+================================================================
+mpn_submul_1:
+
+The current code runs at 2.25 c/l on Itanium 2.  Getting to 2 c/l requires
+ldfp8 with all the alignment headaches that implies.
+
+================================================================
+mpn_addmul_N
+
+For best speed, we need to give up using mpn_addmul_2 as the main multiply
+building block, and instead take multiple v limbs per loop.  For the Itanium
+1, we need to take about 8 limbs at a time for full speed.  For the Itanium
+2, something like mpn_addmul_4 should be enough.
+
+The add+cmp+cmp+add we use in the other routines is optimal for shortening
+recurrences (1 cycle), but the sequence takes up 4 execution slots.  When
+recurrence depth is not critical, a more standard 3-cycle add+cmp+add is
+better.
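+
+As a sketch of the 1-cycle sequence (register and predicate names are
+illustrative; cmpeqor is the cmp.eq.or alias from ia64-defs.m4, and p8 is
+the incoming carry predicate), as used in e.g. aors_n.asm:
+
+	add	w = u, v	;;	// sum, ignoring carry-in
+	cmp.ltu	p6, p0 = w, u		// carry out of the plain add
+  (p8)	cmpeqor	p6, p0 = -1, w		// carry-in also carries out iff w == ~0
+  (p8)	add	w = 1, w		// apply carry-in
+
+The last three operations can issue in one group, so the carry predicate
+recurrence (p8 to p6) is just 1 cycle deep.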
+
+/* First load the 8 values from v */
+	ldfp8		v0, v1 = [r35], 16;;
+	ldfp8		v2, v3 = [r35], 16;;
+	ldfp8		v4, v5 = [r35], 16;;
+	ldfp8		v6, v7 = [r35], 16;;
+
+/* In the inner loop, get a new U limb and store a result limb. */
+	mov		lc = un
+Loop:	ldf8		u0 = [r33], 8
+	ld8		r0 = [r32]
+	xma.l		lp0 = v0, u0, hp0
+	xma.hu		hp0 = v0, u0, hp0
+	xma.l		lp1 = v1, u0, hp1
+	xma.hu		hp1 = v1, u0, hp1
+	xma.l		lp2 = v2, u0, hp2
+	xma.hu		hp2 = v2, u0, hp2
+	xma.l		lp3 = v3, u0, hp3
+	xma.hu		hp3 = v3, u0, hp3
+	xma.l		lp4 = v4, u0, hp4
+	xma.hu		hp4 = v4, u0, hp4
+	xma.l		lp5 = v5, u0, hp5
+	xma.hu		hp5 = v5, u0, hp5
+	xma.l		lp6 = v6, u0, hp6
+	xma.hu		hp6 = v6, u0, hp6
+	xma.l		lp7 = v7, u0, hp7
+	xma.hu		hp7 = v7, u0, hp7
+	getf.sig	l0 = lp0
+	getf.sig	l1 = lp1
+	getf.sig	l2 = lp2
+	getf.sig	l3 = lp3
+	getf.sig	l4 = lp4
+	getf.sig	l5 = lp5
+	getf.sig	l6 = lp6
+	add+cmp+add	xx, l0, r0
+	add+cmp+add	acc0, acc1, l1
+	add+cmp+add	acc1, acc2, l2
+	add+cmp+add	acc2, acc3, l3
+	add+cmp+add	acc3, acc4, l4
+	add+cmp+add	acc4, acc5, l5
+	add+cmp+add	acc5, acc6, l6
+	getf.sig	acc6 = lp7
+	st8		[r32] = xx, 8
+	br.cloop Loop
+
+	49 insn at max 6 insn/cycle:		8.167 cycles/limb8
+	11 memops at max 2 memops/cycle:	5.5 cycles/limb8
+	16 fpops at max 2 fpops/cycle:		8 cycles/limb8
+	21 intops at max 4 intops/cycle:	5.25 cycles/limb8
+	11+21 memops+intops at max 4/cycle	8 cycles/limb8
+
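+The binding limits (fpops, and memops+intops) thus both give 8 cycles per
+8 limbs, i.e. 1 c/l.
+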
+================================================================
+mpn_lshift, mpn_rshift
+
+The current code runs at 1 cycle/limb on Itanium 2.
+
+Using 63 separate loops, we could use the double-word shrp instruction.
+That instruction has a plain single-cycle latency.  We need 63 loops since
+the instruction only accepts an immediate count.  That would lead to a
+somewhat silly code size, but the speed would be 0.75 c/l on Itanium 2 (by
+using shrp each cycle plus shl/shr going down I1 for a further limb every
+second cycle).
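+
+For example, the loop handling a fixed count of 7 would use (a sketch):
+
+	shrp	r8 = r17, r16, 7	// r8 = low 64 bits of {r17,r16} >> 7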
+
+================================================================
+mpn_copyi, mpn_copyd
+
+The current code runs at 0.5 c/l on Itanium 2.  But that is just for L1
+cache hits.  The 4-way unrolled loop takes just 2 cycles, and thus load-use
+scheduling isn't great.  It might be best to actually use modulo scheduled
+loops, since that would allow us to do better load-use scheduling without
+too much unrolling.
+
+Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium
+2, according to tune/speed.  Cache bank conflicts?
+
+
+
+REFERENCES
+
+Intel Itanium Architecture Software Developer's Manual, volumes 1 to 3,
+Intel documents 245317-004, 245318-004, and 245319-004, October 2002.
+Volume 1 includes an Itanium optimization guide.
+
+Intel Itanium Processor-specific Application Binary Interface (ABI), Intel
+document 245370-003, May 2001.  Describes C type sizes, dynamic linking,
+etc.
+
+Intel Itanium Architecture Assembly Language Reference Guide, Intel document
+248801-004, 2000-2002.  Describes assembly instruction syntax and other
+directives.
+
+Itanium Software Conventions and Runtime Architecture Guide, Intel document
+245358-003, May 2001.  Describes calling conventions, including stack
+unwinding requirements.
+
+Intel Itanium Processor Reference Manual for Software Optimization, Intel
+document 245473-003, November 2001.
+
+Intel Itanium-2 Processor Reference Manual for Software Development and
+Optimization, Intel document 251110-003, May 2004.
+
+All the above documents can be found online at
+
+    http://developer.intel.com/design/itanium/manuals.htm
diff --git a/third_party/gmp/mpn/ia64/add_n_sub_n.asm b/third_party/gmp/mpn/ia64/add_n_sub_n.asm
new file mode 100644
index 0000000..c15afaa
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/add_n_sub_n.asm
@@ -0,0 +1,307 @@
+dnl  IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2.25
+
+C INPUT PARAMETERS
+define(`sp', `r32')
+define(`dp', `r33')
+define(`up', `r34')
+define(`vp', `r35')
+define(`n',  `r36')
+
+C Some useful aliases for registers we use
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
+define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
+define(`up0',`up')
+define(`up1',`r14')
+define(`vp0',`vp')
+define(`vp1',`r15')
+
+
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	sp = 0, sp		C				M I
+	addp4	dp = 0, dp		C				M I
+	nop.i	0
+	addp4	up = 0, up		C				M I
+	addp4	vp = 0, vp		C				M I
+	zxt4	n = n			C				I
+	;;
+')
+
+	and	r9 = 3, n		C				M I
+	mov.i	r2 = ar.lc		C				I0
+	add	up1 = 8, up0		C				M I
+	add	vp1 = 8, vp0		C				M I
+	add	r8 = -2, n		C				M I
+	add	r10 = 256, up		C				M I
+	;;
+	shr.u	r8 = r8, 2		C				I0
+	cmp.eq	p10, p0 = 0, r9		C				M I
+	cmp.eq	p11, p0 = 2, r9		C				M I
+	cmp.eq	p12, p0 = 3, r9		C				M I
+	add	r11 = 256, vp		C				M I
+	;;
+	mov.i	ar.lc = r8		C				I0
+  (p10)	br	L(b0)			C				B
+  (p11)	br	L(b2)			C				B
+  (p12)	br	L(b3)			C				B
+
+L(b1):	ld8	u3 = [up0], 8		C				M01
+	add	up1 = 8, up1		C				M I
+	cmpltu	p14, p15 = 4, n		C				M I
+	ld8	v3 = [vp0], 8		C				M01
+	add	vp1 = 8, vp1		C				M I
+	;;
+	add	s3 = u3, v3		C				M I
+	sub	d3 = u3, v3		C				M I
+	mov	r8 = 0			C				M I
+	;;
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+  (p15)	br	L(cj1)			C				B
+	st8	[sp] = s3, 8		C				M23
+	st8	[dp] = d3, 8		C				M23
+	br	L(c0)			C				B
+
+L(b0):	cmp.ne	p9, p0 = r0, r0		C				M I
+	cmp.ne	p13, p0 = r0, r0	C				M I
+L(c0):	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+	;;
+	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+	;;
+	add	s0 = u0, v0		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d0 = u0, v0		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	nop	0			C
+	br.cloop.dptk	L(top)		C				B
+	br	L(end)			C				B
+
+L(b3):	ld8	u1 = [up0], 8		C				M01
+	add	up1 = 8, up1		C				M I
+	ld8	v1 = [vp0], 8		C				M01
+	;;
+	add	vp1 = 8, vp1		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	br	L(c2)			C				B
+
+	ALIGN(32)
+L(b2):	cmp.ne	p7, p0 = r0, r0		C				M I
+	cmp.ne	p11, p0 = r0, r0	C				M I
+	nop	0
+L(c2):	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+	cmpltu	p14, p0 = 4, n		C				M I
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+  (p14)	br	L(gt4)			C				B
+	;;
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	br	L(cj2)			C				B
+	;;
+L(gt4):	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+	;;
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	br.cloop.dptk	L(mid)		C				B
+
+	ALIGN(32)
+L(top):
+	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+   (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
+   (p9)	add	s0 = 1, s0		C				M I
+  (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
+  (p13)	add	d0 = -1, d0		C				M I
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+   (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
+   (p6)	add	s1 = 1, s1		C				M I
+  (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
+  (p10)	add	d1 = -1, d1		C				M I
+	;;
+	st8	[sp] = s0, 8		C				M23
+	st8	[dp] = d0, 8		C				M23
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	;;
+L(mid):
+	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+   (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
+   (p7)	add	s2 = 1, s2		C				M I
+  (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
+  (p11)	add	d2 = -1, d2		C				M I
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+   (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
+   (p8)	add	s3 = 1, s3		C				M I
+  (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
+  (p12)	add	d3 = -1, d3		C				M I
+	;;
+	st8	[sp] = s2, 8		C				M23
+	st8	[dp] = d2, 8		C				M23
+	add	s0 = u0, v0		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d0 = u0, v0		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	st8	[sp] = s3, 8		C				M23
+	st8	[dp] = d3, 8		C				M23
+	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	lfetch	[r10], 32		C				M?
+	lfetch	[r11], 32		C				M?
+	br.cloop.dptk	L(top)		C				B
+	;;
+
+L(end):
+	nop	0
+	nop	0
+   (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
+   (p9)	add	s0 = 1, s0		C				M I
+  (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
+  (p13)	add	d0 = -1, d0		C				M I
+	;;
+	nop	0
+	nop	0
+   (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
+   (p6)	add	s1 = 1, s1		C				M I
+  (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
+  (p10)	add	d1 = -1, d1		C				M I
+	;;
+	st8	[sp] = s0, 8		C				M23
+	st8	[dp] = d0, 8		C				M23
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	;;
+L(cj2):
+   (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
+   (p7)	add	s2 = 1, s2		C				M I
+  (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
+  (p11)	add	d2 = -1, d2		C				M I
+	mov	r8 = 0			C				M I
+	nop	0
+	;;
+	st8	[sp] = s2, 8		C				M23
+	st8	[dp] = d2, 8		C				M23
+   (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
+   (p8)	add	s3 = 1, s3		C				M I
+  (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
+  (p12)	add	d3 = -1, d3		C				M I
+	;;
+L(cj1):
+   (p9)	mov	r8 = 2			C				M I
+	;;
+	mov.i	ar.lc = r2		C				I0
+  (p13)	add	r8 = 1, r8		C				M I
+	st8	[sp] = s3		C				M23
+	st8	[dp] = d3		C				M23
+	br.ret.sptk.many b0		C				B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/addmul_1.asm b/third_party/gmp/mpn/ia64/addmul_1.asm
new file mode 100644
index 0000000..ffa3297
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/addmul_1.asm
@@ -0,0 +1,602 @@
+dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    3.0
+C Itanium 2:  2.0
+
+C TODO
+C  * Further optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Use 1 c/l carry propagation scheme in wind-down code.
+C  * Use extra pointer registers for `up' and `rp' to speed up feed-in loads.
+C  * Work out final differences with mul_1.asm.  That function is 300 bytes
+C    smaller than this due to better loop scheduling and thus simpler feed-in
+C    code.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	adds		r15 = -1, n		C M I
+	mov		r20 = rp		C M I
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmi
+	ldf8		f7 = [up], 8		C M
+	ldf8		f8 = [rp], 8		C M
+	and		r14 = 3, n		C M I
+	;;
+}
+{.mmi
+	setf.sig	f6 = vl			C M2 M3
+	cmp.eq		p10, p0 = 0, r14	C M I
+	shr.u		r31 = r15, 2		C I0
+}
+{.mmi
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	nop.i		0			C I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1			C B
+
+	xma.l		f39 = f7, f6, f8	C F
+	xma.hu		f43 = f7, f6, f8	C F
+	;;
+	getf.sig	r8 = f43		C M2
+	stf8		[r20] = f39		C M2 M3
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+
+.grt1:
+	ldf8		f32 = [up], 8
+	ldf8		f44 = [rp], 8
+	;;
+	ldf8		f33 = [up], 8
+	ldf8		f45 = [rp], 8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f7, f6, f8
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt5
+
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	stf8		[r20] = f39, 8
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	getf.sig	r24 = f36
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	getf.sig	r25 = f37
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	getf.sig	r26 = f38
+	br		.Lcj5
+
+.grt5:
+	mov		r30 = 0
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r28 = f40
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37
+	br.cloop.dptk	.Loop
+	br		.Le0
+
+
+.Lb10:	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r30 = f42
+	stf8		[r20] = f38, 8
+	getf.sig	r27 = f39
+	getf.sig	r8 = f43
+	br		.Lcj2
+
+.grt2:
+	ldf8		f32 = [up], 8
+	ldf8		f44 = [rp], 8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f8
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt6
+
+	stf8		[r20] = f38, 8
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	getf.sig	r24 = f36
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	getf.sig	r25 = f37
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	br		.Lcj6
+
+.grt6:
+	mov		r29 = 0
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36
+	br		.LL10
+
+
+.Lb11:	ldf8		f34 = [up], 8
+	ldf8		f46 = [rp], 8
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f37 = f7, f6, f8
+	xma.hu		f41 = f7, f6, f8
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	stf8		[r20] = f37, 8
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	getf.sig	r8 = f43
+	br		.Lcj3
+
+.grt3:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f7, f6, f8
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37		C FIXME
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt7
+
+	getf.sig	r29 = f41
+	stf8		[r20] = f37, 8		C FIXME
+	xma.l		f36 = f32, f6, f44
+	getf.sig	r26 = f38
+	xma.hu		f40 = f32, f6, f44
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	getf.sig	r27 = f39
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	getf.sig	r24 = f36
+	xma.hu		f42 = f34, f6, f46
+	br		.Lcj7
+
+.grt7:
+	getf.sig	r29 = f41
+	xma.l		f36 = f32, f6, f44
+	mov		r28 = 0
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	br		.LL11
+
+
+.Lb00:	ldf8		f33 = [up], 8
+	ldf8		f45 = [rp], 8
+	;;
+	ldf8		f34 = [up], 8
+	ldf8		f46 = [rp], 8
+	;;
+	ldf8		f35 = [up], 8
+	xma.l		f36 = f7, f6, f8
+	ldf8		f47 = [rp], 8
+	xma.hu		f40 = f7, f6, f8
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	stf8		[r20] = f36, 8
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	br		.Lcj4
+
+.grt4:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36		C FIXME
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt8
+
+	getf.sig	r29 = f41
+	stf8		[r20] = f36, 8		C FIXME
+	xma.l		f36 = f32, f6, f44
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	xma.hu		f40 = f32, f6, f44
+	;;
+	xma.l		f37 = f33, f6, f45
+	getf.sig	r27 = f39
+	xma.hu		f41 = f33, f6, f45
+	br		.Lcj8
+
+.grt8:
+	getf.sig	r29 = f41
+	xma.l		f36 = f32, f6, f44
+	mov		r31 = 0
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	br		.LL00
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)				C insn	fed	cycle #
+.Loop:
+	.pred.rel "mutex", p6, p7		C num	by	i1 i2
+	getf.sig	r29 = f41		C 00	16	0   0
+	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
+   (p6)	add		r14 = r30, r27, 1	C 02		0   0
+	ldf8		f47 = [rp], 8		C 03		0   0
+	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
+   (p7)	add		r14 = r30, r27		C 05		0   0
+	;;
+	.pred.rel "mutex", p6, p7
+	ldf8		f32 = [up], 8		C 06		1   1
+   (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
+	getf.sig	r26 = f38		C 09	25	2   1
+	st8		[r20] = r14, 8		C 10		2   1
+	nop.b		0			C 11		2   1
+	;;
+.LL00:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C 12	28	3   2
+	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
+   (p8)	add		r16 = r31, r24, 1	C 14		3   2
+	ldf8		f44 = [rp], 8		C 15		3   2
+	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
+   (p9)	add		r16 = r31, r24		C 17		3   2
+	;;
+	.pred.rel "mutex", p8, p9
+	ldf8		f33 = [up], 8		C 18		4   3
+   (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
+	getf.sig	r27 = f39		C 21	37	5   3
+	st8		[r20] = r16, 8		C 22		5   3
+	nop.b		0			C 23		5   3
+	;;
+.LL11:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r31 = f43		C 24	40	6   4
+	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
+   (p6)	add		r14 = r28, r25, 1	C 26		6   4
+	ldf8		f45 = [rp], 8		C 27		6   4
+	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
+   (p7)	add		r14 = r28, r25		C 29		6   4
+	;;
+	.pred.rel "mutex", p6, p7
+	ldf8		f34 = [up], 8		C 30		7   5
+   (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
+	getf.sig	r24 = f36		C 33	01	8   5
+	st8		[r20] = r14, 8		C 34		8   5
+	nop.b		0			C 35		8   5
+	;;
+.LL10:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r28 = f40		C 36	04	9   6
+	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
+   (p8)	add		r16 = r29, r26, 1	C 38		9   6
+	ldf8		f46 = [rp], 8		C 39		9   6
+	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
+   (p9)	add		r16 = r29, r26		C 41		9   6
+	;;
+	.pred.rel "mutex", p8, p9
+	ldf8		f35 = [up], 8		C 42	       10   7
+   (p8)	cmp.leu		p6, p7 = r16, r26	C 43	       10   7
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C 44	       10   7
+	getf.sig	r25 = f37		C 45	13     11   7
+	st8		[r20] = r16, 8		C 46	       11   7
+	br.cloop.dptk	.Loop			C 47	       11   7
+C *** MAIN LOOP END ***
+	;;
+.Le0:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r29 = f41		C
+	xma.l		f36 = f32, f6, f44	C
+   (p6)	add		r14 = r30, r27, 1	C
+	ldf8		f47 = [rp], 8		C
+	xma.hu		f40 = f32, f6, f44	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	getf.sig	r26 = f38		C
+	st8		[r20] = r14, 8		C
+	;;
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C
+	xma.l		f37 = f33, f6, f45	C
+   (p8)	add		r16 = r31, r24, 1	C
+	xma.hu		f41 = f33, f6, f45	C
+   (p9)	add		r16 = r31, r24		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r24	C
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C
+	getf.sig	r27 = f39		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj8:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r31 = f43		C
+	xma.l		f38 = f34, f6, f46	C
+   (p6)	add		r14 = r28, r25, 1	C
+	xma.hu		f42 = f34, f6, f46	C
+   (p7)	add		r14 = r28, r25		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r25	C
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C
+	getf.sig	r24 = f36		C
+	st8		[r20] = r14, 8		C
+	;;
+.Lcj7:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r28 = f40		C
+	xma.l		f39 = f35, f6, f47	C
+   (p8)	add		r16 = r29, r26, 1	C
+	xma.hu		f43 = f35, f6, f47	C
+   (p9)	add		r16 = r29, r26		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r26	C
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C
+	getf.sig	r25 = f37		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj6:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r29 = f41		C
+   (p6)	add		r14 = r30, r27, 1	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	getf.sig	r26 = f38		C
+	st8		[r20] = r14, 8		C
+	;;
+.Lcj5:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C
+   (p8)	add		r16 = r31, r24, 1	C
+   (p9)	add		r16 = r31, r24		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r24	C
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C
+	getf.sig	r27 = f39		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj4:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r8 = f43		C
+   (p6)	add		r14 = r28, r25, 1	C
+   (p7)	add		r14 = r28, r25		C
+	;;
+	.pred.rel "mutex", p6, p7
+	st8		[r20] = r14, 8		C
+   (p6)	cmp.leu		p8, p9 = r14, r25	C
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C
+	;;
+.Lcj3:
+	.pred.rel "mutex", p8, p9
+   (p8)	add		r16 = r29, r26, 1	C
+   (p9)	add		r16 = r29, r26		C
+	;;
+	.pred.rel "mutex", p8, p9
+	st8		[r20] = r16, 8		C
+   (p8)	cmp.leu		p6, p7 = r16, r26	C
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C
+	;;
+.Lcj2:
+	.pred.rel "mutex", p6, p7
+   (p6)	add		r14 = r30, r27, 1	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+	st8		[r20] = r14		C
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	;;
+   (p8)	add		r8 = 1, r8		C M I
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/addmul_2.asm b/third_party/gmp/mpn/ia64/addmul_2.asm
new file mode 100644
index 0000000..86e8de4
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/addmul_2.asm
@@ -0,0 +1,715 @@
+dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl  add the result to an (n+1)-limb number.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    3.65
+C Itanium 2:  1.625
+
+C TODO
+C  * Clean up variable names, and try to decrease the number of distinct
+C    registers used.
+C  * Clean up feed-in code to not require zeroing several registers.
+C  * Make sure we don't depend on uninitialised predicate registers.
+C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C    wind-down code.
+C  * Ultimately rewrite.  The problem with this code is that it first uses a
+C    loaded u value in one xma pair, then leaves it live over several unrelated
+C    xma pairs, before it uses it again.  It should actually be quite possible
+C    to just swap some aligned xma pairs around.  But we should then schedule
+C    u loads further from the first use.
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`r_0',`f40') define(`r_1',`f41')
+define(`r_2',`f42') define(`r_3',`f43')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`rx',`f48')
+define(`ux',`f49')
+define(`ry',`f50')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_addmul_2s)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		ldf8	rx = [rp], 8		C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		ldf8	ry = [rp], -8		C			M
+		cmp.eq	p14, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		add	srp = 16, rp		C			M I
+		cmp.eq	p15, p0 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p14)	br.dptk	L(x01)			C			B
+	(p11)	br.dptk	L(x10)			C			B
+	(p15)	br.dptk	L(x11)			C			B
+	;;
+}
+L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
+		mov	fp2a_3 = f0
+		br	L(b00)
+L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_2 = f0
+		br	L(b01)
+L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_1 = f0
+		br	L(b10)
+L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_0 = f0
+		br	L(b11)
+
+EPILOGUE()
+
+PROLOGUE(mpn_addmul_2)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		ldf8	rx = [rp], 8		C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		ldf8	ry = [rp], -8		C			M
+		cmp.eq	p14, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		add	srp = 16, rp		C			M I
+		cmp.eq	p15, p6 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p14)	br.dptk	L(b01)			C			B
+	(p11)	br.dptk	L(b10)			C			B
+	(p15)	br.dptk	L(b11)			C			B
+	;;
+}
+	ALIGN(32)
+L(b00):
+ {.mmi;		ldf8	r_1 = [srp], 8
+		ldf8	u_1 = [up], 8
+		mov	acc1_2 = 0
+}{.mmi;		mov	pr1_2 = 0
+		mov	pr0_3 = 0
+		cmp.ne	p8, p9 = r0, r0
+	;;
+}{.mfi;		ldf8	r_2 = [srp], 8
+		xma.l	fp0b_3 = ux, v0, rx
+		cmp.ne	p12, p13 = r0, r0
+}{.mfb;		ldf8	u_2 = [up], 8
+		xma.hu	fp1b_3 = ux, v0, rx
+		br.cloop.dptk	L(gt4)
+}
+		xma.l	fp0b_0 = uy, v0, ry
+		xma.hu	fp1a_0 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_3
+	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_1 = u_1, v0, r_1
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		getfsig	pr1_3 = fp1b_3
+		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+		xma.hu	fp1a_2 = u_2, v0, r_2
+		br	L(cj4)
+
+L(gt4):		xma.l	fp0b_0 = uy, v0, ry
+		xma.hu	fp1a_0 = uy, v0, ry
+	;;
+		ldf8	r_3 = [srp], 8
+		getfsig	acc0 = fp0b_3
+	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+		ldf8	u_3 = [up], 8
+	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_1 = u_1, v0, r_1
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+		ldf8	r_0 = [srp], 8
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+	;;
+		getfsig	acc1_3 = fp2a_3
+		xma.hu	fp1a_2 = u_2, v0, r_2
+		br	L(00)
+
+
+	ALIGN(32)
+L(b01):
+ {.mmi;		ldf8	r_0 = [srp], 8		C M
+		ldf8	u_0 = [up], 8		C M
+		mov	acc1_1 = 0		C M I
+}{.mmi;		mov	pr1_1 = 0		C M I
+		mov	pr0_2 = 0		C M I
+		cmp.ne	p6, p7 = r0, r0		C M I
+	;;
+}{.mfi;		ldf8	r_1 = [srp], 8		C M
+		xma.l	fp0b_2 = ux, v0, rx	C F
+		cmp.ne	p10, p11 = r0, r0	C M I
+}{.mfi;		ldf8	u_1 = [up], 8		C M
+		xma.hu	fp1b_2 = ux, v0, rx	C F
+		nop	1
+	;;
+}		xma.l	fp0b_3 = uy, v0, ry	C F
+		xma.hu	fp1a_3 = uy, v0, ry	C F
+	;;
+ {.mmf;		getfsig	acc0 = fp0b_2		C M
+		ldf8	r_2 = [srp], 8		C M
+	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
+}{.mfb;		ldf8	u_2 = [up], 8		C M
+	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
+		br.cloop.dptk	L(gt5)
+}
+		xma.l	fp0b_0 = u_0, v0, r_0	C F
+		xma.hu	fp1a_0 = u_0, v0, r_0	C F
+	;;
+		getfsig	pr0_3 = fp0b_3		C M
+		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
+		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
+	;;
+		getfsig	pr1_2 = fp1b_2		C M
+		getfsig	acc1_2 = fp2a_2		C M
+		xma.l	fp0b_1 = u_1, v0, r_1	C F
+		xma.hu	fp1a_1 = u_1, v0, r_1	C F
+		br	L(cj5)
+
+L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
+		xma.hu	fp1a_0 = u_0, v0, r_0
+	;;
+		getfsig	pr0_3 = fp0b_3
+		ldf8	r_3 = [srp], 8
+		xma.l	fp1b_3 = uy, v1, fp1a_3
+		xma.hu	fp2a_3 = uy, v1, fp1a_3
+	;;
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+		xma.l	fp0b_1 = u_1, v0, r_1
+	;;
+		getfsig	acc1_2 = fp2a_2
+		xma.hu	fp1a_1 = u_1, v0, r_1
+		br	L(01)
+
+
+	ALIGN(32)
+L(b10):		br.cloop.dptk	L(gt2)
+		xma.l	fp0b_1 = ux, v0, rx
+		xma.hu	fp1b_1 = ux, v0, rx
+	;;
+		xma.l	fp0b_2 = uy, v0, ry
+		xma.hu	fp1a_2 = uy, v0, ry
+	;;
+		stf8	[rp] = fp0b_1, 8
+	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	;;
+		getfsig	acc0 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		getfsig	pr1_1 = fp1b_1
+		getfsig	acc1_1 = fp2a_1
+		mov	ar.lc = r2
+		getfsig	pr1_2 = fp1b_2
+		getfsig	r8 = fp2a_2
+	;;
+		add	s0 = pr1_1, acc0
+	;;
+		st8	[rp] = s0, 8
+		cmp.ltu	p8, p9 = s0, pr1_1
+		sub	r31 = -1, acc1_1
+	;;
+	.pred.rel "mutex", p8, p9
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	(p8)	cmp.leu	p10, p0 = r31, pr1_2
+	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
+	;;
+		st8	[rp] = acc0, 8
+	(p10)	add	r8 = 1, r8
+		br.ret.sptk.many b0
+
+
+L(gt2):
+ {.mmi;		ldf8	r_3 = [srp], 8
+		ldf8	u_3 = [up], 8
+		mov	acc1_0 = 0
+	;;
+}{.mfi;		ldf8	r_0 = [srp], 8
+		xma.l	fp0b_1 = ux, v0, rx
+		mov	pr1_0 = 0
+}{.mfi;		ldf8	u_0 = [up], 8
+		xma.hu	fp1b_1 = ux, v0, rx
+		mov	pr0_1 = 0
+	;;
+}		xma.l	fp0b_2 = uy, v0, ry
+		xma.hu	fp1a_2 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_1
+		ldf8	r_1 = [srp], 8
+	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	;;
+		ldf8	u_1 = [up], 8
+		xma.l	fp0b_3 = u_3, v0, r_3
+		xma.hu	fp1a_3 = u_3, v0, r_3
+	;;
+		getfsig	pr0_2 = fp0b_2
+		ldf8	r_2 = [srp], 8
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	;;
+ {.mfi;		getfsig	acc1_1 = fp2a_1
+		xma.l	fp0b_0 = u_0, v0, r_0
+		cmp.ne	p8, p9 = r0, r0
+}{.mfb;		cmp.ne	p12, p13 = r0, r0
+		xma.hu	fp1a_0 = u_0, v0, r_0
+		br.cloop.sptk.clr	L(top)
+}
+		br.many	L(end)
+
+
+	ALIGN(32)
+L(b11):		ldf8	r_2 = [srp], 8
+		mov	pr1_3 = 0
+		mov	pr0_0 = 0
+	;;
+		ldf8	u_2 = [up], 8
+		mov	acc1_3 = 0
+		br.cloop.dptk	L(gt3)
+	;;
+		cmp.ne	p6, p7 = r0, r0
+		xma.l	fp0b_0 = ux, v0, rx
+		xma.hu	fp1b_0 = ux, v0, rx
+	;;
+		cmp.ne	p10, p11 = r0, r0
+		xma.l	fp0b_1 = uy, v0, ry
+		xma.hu	fp1a_1 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_0
+	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_2 = uy, v1, r_2
+		xma.hu	fp1a_2 = uy, v1, r_2
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_2, v0, fp1a_1
+		xma.hu	fp2a_1 = u_2, v0, fp1a_1
+	;;
+		getfsig	pr1_0 = fp1b_0
+		getfsig	acc1_0 = fp2a_0
+		br	L(cj3)
+
+L(gt3):		ldf8	r_3 = [srp], 8
+		xma.l	fp0b_0 = ux, v0, rx
+		cmp.ne	p10, p11 = r0, r0
+		ldf8	u_3 = [up], 8
+		xma.hu	fp1b_0 = ux, v0, rx
+		cmp.ne	p6, p7 = r0, r0
+	;;
+		xma.l	fp0b_1 = uy, v0, ry
+		xma.hu	fp1a_1 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_0
+		ldf8	r_0 = [srp], 8
+	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+		ldf8	u_0 = [up], 8
+	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_2 = u_2, v0, r_2
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;
+		getfsig	pr0_1 = fp0b_1
+		ldf8	r_1 = [srp], 8
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	;;
+		getfsig	acc1_0 = fp2a_0
+		xma.l	fp0b_3 = u_3, v0, r_3
+		xma.hu	fp1a_3 = u_3, v0, r_3
+		br	L(11)
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):						C 00
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_3 = fp0b_3
+		ldf8	r_3 = [srp], 8
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+	;;					C 01
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;					C 02
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_2 = fp2a_2
+		st8	[rp] = s0, 8
+		xma.l	fp0b_1 = u_1, v0, r_1
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;					C 03
+L(01):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_0 = fp0b_0
+		ldf8	r_0 = [srp], 8
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+	;;					C 04
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;					C 05
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_3 = fp2a_3
+		st8	[rp] = s0, 8
+		xma.l	fp0b_2 = u_2, v0, r_2
+	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;					C 06
+L(00):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_1 = fp0b_1
+		ldf8	r_1 = [srp], 8
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+	;;					C 07
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;					C 08
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_0 = fp2a_0
+		st8	[rp] = s0, 8
+		xma.l	fp0b_3 = u_3, v0, r_3
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+		xma.hu	fp1a_3 = u_3, v0, r_3
+	;;					C 09
+L(11):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_2 = fp0b_2
+		ldf8	r_2 = [srp], 8
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+	;;					C 10
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;					C 11
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_1 = fp2a_1
+		st8	[rp] = s0, 8
+		xma.l	fp0b_0 = u_0, v0, r_0
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+		xma.hu	fp1a_0 = u_0, v0, r_0
+L(10):		br.cloop.sptk.clr	L(top)	C 12
+	;;
+C *** MAIN LOOP END ***
+L(end):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mfi;		getfsig	acc1_2 = fp2a_2
+		xma.l	fp0b_1 = u_1, v0, r_1
+		nop	1
+}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+}
+L(cj5):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_3 = fp1b_3
+	st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mfi;		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+		nop	1
+}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;
+}
+L(cj4):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_0 = fp1b_0
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	acc1_0 = fp2a_0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+	;;
+}
+L(cj3):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_1 = fp1b_1
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		getfsig	acc1_1 = fp2a_1
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+	;;
+}	.pred.rel "mutex", p12, p13
+ {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	r8 = fp2a_2
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		st8	[rp] = s0, 8
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
+	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
+	(p12)	add	acc0 = 1, acc0
+	;;
+}{.mmi;		st8	[rp] = acc0, 8
+	(p12)	cmpeqor	p10, p0 = 0, acc0
+		nop	1
+	;;
+}{.mib;	(p10)	add	r8 = 1, r8
+		mov	ar.lc = r2
+		br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/aors_n.asm b/third_party/gmp/mpn/ia64/aors_n.asm
new file mode 100644
index 0000000..7705ce6
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aors_n.asm
@@ -0,0 +1,852 @@
+dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2.67
+C Itanium 2:    1.25
+
+C TODO
+C  * Consider using special code for small n, using something like
+C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
+C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
+C    probably hard to save more than an odd cycle there.  The nc code is much
+C    cruder (since tune/speed doesn't have any applicable direct measurements).
+C  * Without the nc entry points, this becomes around 1800 bytes of object
+C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
+C    few cycles for the non-nc code and let it fall into the nc code.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n',  `r35')
+define(`cy', `r36')
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUB,	add)
+  define(CND,		ltu)
+  define(INCR,		1)
+  define(LIM,		-1)
+  define(LIM2,		0)
+  define(func,    mpn_add_n)
+  define(func_nc, mpn_add_nc)
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUB,	sub)
+  define(CND,		gtu)
+  define(INCR,		-1)
+  define(LIM,		0)
+  define(LIM2,		-1)
+  define(func,    mpn_sub_n)
+  define(func_nc, mpn_sub_nc)
+')
+
+define(PFDIST, 500)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`rpx',`r3')
+define(`upadv',`r20') define(`vpadv',`r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		nop.i	0
+		addp4	vp = 0, vp		C			M I
+		nop.m	0
+		zxt4	n = n			C			I
+	;;
+')
+
+ {.mmi;		ld8	r11 = [vp], 8		C			M01
+		ld8	r10 = [up], 8		C			M01
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		and	r14 = 7, n		C			M I
+		cmp.lt	p15, p14 = 8, n		C			M I
+		add	n = -6, n		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C Merging these lines into the feed-in
+		add	vpadv = PFDIST, vp	C code could save a cycle per call at
+		mov	r23 = cy		C the expense of code size.
+	;;
+}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
+		cmp.eq	p7, p0 = 2, r14		C			M I
+		cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb;	(p6)	br.dptk	.Lc001			C			B
+	(p7)	br.dptk	.Lc010			C			B
+	(p8)	br.dptk	.Lc011			C			B
+	;;
+}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
+		cmp.eq	p10, p0 = 5, r14	C			M I
+		cmp.eq	p11, p0 = 6, r14	C			M I
+}{.bbb;	(p9)	br.dptk	.Lc100			C			B
+	(p10)	br.dptk	.Lc101			C			B
+	(p11)	br.dptk	.Lc110			C			B
+	;;
+}{.mmi;		ld8	r19 = [vp], 8		C			M01
+		ld8	r18 = [up], 8		C			M01
+		cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
+}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
+		nop	0
+	(p12)	br.dptk	.Lc111			C			B
+	;;
+}
+
+.Lc000:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	vpadv = PFDIST, vp	C			M I
+		ld8	v0 = [vp], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r18, r19		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r18	C			M I
+	(p13)	cmpeqor	p7, p0 = LIM, w1	C			M I
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+	(p13)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m0)
+}
+
+.Lc001:
+ {.mmi;	(p15)	ld8	v1 = [vp], 8		C			M01
+	(p15)	ld8	u1 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+}{.mmb;		nop	0
+		nop	0
+	(p15)	br	L(0)
+	;;
+}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
+		mov	r8 = 0
+		cmp.CND	p6, p0 = w0, r10	C			M I
+	;;
+}{.mmb;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+		br	L(cj1)			C			B
+}
+L(0):
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		nop	0
+		cmp.ne	p9, p0 = 0, r23		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+		add	rpx = 16, rp		C			M I
+}{.mmb;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		br	L(c1)			C			B
+}
+
+.Lc010:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		mov	r8 = 0			C			M I
+}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
+		cmp.ne	p8, p0 = 0, r23		C			M I
+	(p15)	br	L(1)			C			B
+	;;
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+	;;
+}{.mmb;		cmp.CND	p6, p0 = w0, u0		C			M I
+	(p8)	cmpeqor	p9, p0 = LIM2, w3	C			M I
+		br	L(cj2)			C			B
+}
+L(1):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+	;;
+}{.mmi;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		add	rpx = 24, rp		C			M I
+		nop	0
+		br	L(m23)			C			B
+}
+
+.Lc011:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ADDSUB	w2 = r10, r11		C			M I
+		cmp.ne	p7, p0 = 0, r23		C			M I
+		nop	0
+	;;
+}{.mmb;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+	(p15)	br	L(2)			C			B
+}{.mmi;		cmp.CND	p8, p0 = w2, r10	C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		nop	0
+	;;
+}{.mmb;	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(cj3)			C			B
+}
+L(2):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m23)
+}
+
+.Lc100:
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ADDSUB	w1 = r10, r11		C			M I
+		nop	0
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		add	rpx = 8, rp		C			M I
+}{.mmi;		cmp.ne	p6, p0 = 0, r23		C			M I
+		cmp.CND	p7, p0 = w1, r10	C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w2 = u2, v2		C			M I
+}{.mmb;	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+	(p6)	add	w1 = INCR, w1		C			M I
+	(p14)	br	L(cj4)
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+		nop	0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		nop	0
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m4)
+}
+
+.Lc101:
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
+		add	rpx = 16, rp		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+}{.mbb;		ADDSUB	w1 = u1, v1		C			M I
+	(p15)	br	L(c5)			C			B
+		br	L(end)			C			B
+}
+
+.Lc110:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		add	vpadv = PFDIST, vp	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		cmp.ne	p8, p0 = 0, r23		C			M I
+		add	rpx = 24, rp		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		nop	0
+}{.mmb;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+		br	L(m67)			C			B
+}
+
+.Lc111:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		ld8	v1 = [vp], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+		nop	0
+	;;
+}{.mmi;		add	vpadv = PFDIST, vp	C			M I
+		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = r18, r19		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r18	C			M I
+	(p13)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+	(p13)	add	w2 = INCR, w2		C			M I
+		nop	0
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m67)
+}
+EPILOGUE()
+
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		nop.i	0
+		addp4	vp = 0, vp		C			M I
+		nop.m	0
+		zxt4	n = n			C			I
+	;;
+')
+
+ {.mmi;		ld8	r11 = [vp], 8		C			M01
+		ld8	r10 = [up], 8		C			M01
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		and	r14 = 7, n		C			M I
+		cmp.lt	p15, p14 = 8, n		C			M I
+		add	n = -6, n		C			M I
+	;;
+}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
+		cmp.eq	p7, p0 = 2, r14		C			M I
+		cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb;	(p6)	br.dptk	.Lb001			C			B
+	(p7)	br.dptk	.Lb010			C			B
+	(p8)	br.dptk	.Lb011			C			B
+	;;
+}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
+		cmp.eq	p10, p0 = 5, r14	C			M I
+		cmp.eq	p11, p0 = 6, r14	C			M I
+}{.bbb;	(p9)	br.dptk	.Lb100			C			B
+	(p10)	br.dptk	.Lb101			C			B
+	(p11)	br.dptk	.Lb110			C			B
+	;;
+}{.mmi;		ld8	r19 = [vp], 8		C			M01
+		ld8	r18 = [up], 8		C			M01
+		cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
+}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
+		mov	r23 = 0			C			M I
+	(p12)	br.dptk	.Lb111			C			B
+	;;
+}
+
+.Lb000:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r18, r19		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		cmp.CND	p8, p0 = w2, r18	C			M I
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m0)			C			B
+}
+
+	ALIGN(32)
+.Lb001:
+ {.mmi;		ADDSUB	w0 = r10, r11		C			M I
+	(p15)	ld8	v1 = [vp], 8		C			M01
+		mov	r8 = 0			C			M I
+	;;
+}{.mmb;		cmp.CND	p6, p0 = w0, r10	C			M I
+	(p15)	ld8	u1 = [up], 8		C			M01
+	(p14)	br	L(cj1)			C			B
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+}{.mmb;		ld8	u1 = [up], 8		C			M01
+		add	rpx = 16, rp		C			M I
+		br	L(m1)			C			B
+}
+
+	ALIGN(32)
+.Lb010:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
+		nop	0
+	(p15)	br	L(gt2)			C			B
+	;;
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		mov	r8 = 0			C			M I
+	;;
+}{.mmb;		nop	0
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		br	L(cj2)			C			B
+}
+L(gt2):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+		add	rpx = 24, rp		C			M I
+		br	L(m23)			C			B
+}
+
+	ALIGN(32)
+.Lb011:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+	;;
+}{.mmb;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+	(p15)	br	L(3)			C			B
+}{.mmb;		cmp.CND	p8, p0 = w2, r10	C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		br	L(cj3)			C			B
+}
+L(3):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		ADDSUB	w3 = u3, v3		C			M I
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+		nop	0
+		nop	0
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m23)			C			B
+}
+
+	ALIGN(32)
+.Lb100:
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+}{.mmb;		nop	0
+		ADDSUB	w2 = u2, v2		C			M I
+	(p14)	br	L(cj4)			C			B
+	;;
+}
+L(gt4):
+ {.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+		nop	0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m4)			C			B
+}
+
+	ALIGN(32)
+.Lb101:
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		add	rpx = 16, rp		C			M I
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+		nop	0
+}{.mmb;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+	(p14)	br	L(cj5)			C			B
+	;;
+}
+L(gt5):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmb;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = u2, v2		C			M I
+		br	L(m5)			C			B
+}
+
+	ALIGN(32)
+.Lb110:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = r10, r11		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+		add	rpx = 24, rp		C			M I
+		br	L(m67)			C			B
+}
+
+	ALIGN(32)
+.Lb111:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = r18, r19		C			M I
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		nop	0
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		cmp.CND	p9, p0 = w3, r18	C			M I
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m67)			C			B
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):
+L(c5):		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+		ld8	u1 = [up], 8		C			M01
+	(p9)	add	w0 = INCR, w0		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+	;;
+L(m5):		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+		ld8	u2 = [up], 8		C			M01
+	(p6)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+		st8	[rp] = w0, 8		C			M23
+		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+L(m4):		st8	[rp] = w1, 16		C			M23
+		st8	[rpx] = w2, 32		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+		lfetch	[upadv], 64
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	;;
+L(m23):		st8	[rp] = w3, 8		C			M23
+		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		nop.b	0
+	;;
+L(c1):		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+		ld8	u1 = [up], 8		C			M01
+	(p9)	add	w0 = INCR, w0		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+	;;
+L(m1):		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+		ld8	u2 = [up], 8		C			M01
+	(p6)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+		st8	[rp] = w0, 8		C			M23
+		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+L(m0):		st8	[rp] = w1, 16		C			M23
+		st8	[rpx] = w2, 32		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+		lfetch	[vpadv], 64
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	;;
+L(m67):		st8	[rp] = w3, 8		C			M23
+		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		br.cloop.dptk	L(top)		C			B
+	;;
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+		mov	ar.lc = r2		C			I0
+}
+L(cj5):
+ {.mmi;		cmp.CND	p7, p0 = w1, u1		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w0, 8		C			M23
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+	(p6)	add	w1 = INCR, w1		C			M I
+}
+L(cj4):
+ {.mmi;		cmp.CND	p8, p0 = w2, u2		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w1, 8		C			M23
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+}
+L(cj3):
+ {.mmi;		cmp.CND	p9, p0 = w3, u3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+}{.mmi;		cmp.CND	p6, p0 = w0, u0		C			M I
+		nop	0
+		mov	r8 = 0			C			M I
+	;;
+}
+L(cj2):
+ {.mmi;		st8	[rp] = w3, 8		C			M23
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+	;;
+}
+L(cj1):
+ {.mmb;		st8	[rp] = w0, 8		C			M23
+	(p6)	mov	r8 = 1			C			M I
+		br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm b/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..9b58b9e
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm
@@ -0,0 +1,48 @@
+dnl  IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      3.0
+C Itanium 2:    1.5
+
+
+define(LSH,		1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm b/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..39b384a
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm
@@ -0,0 +1,48 @@
+dnl  IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      3.0
+C Itanium 2:    1.5
+
+
+define(LSH,		2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm b/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..2703ce2
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm
@@ -0,0 +1,412 @@
+dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.5
+
+C TODO
+C  * Use shladd in feed-in code (for mpn_addlshC_n).
+C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n',  `r35')
+
+ifdef(`DO_add', `
+  define(`ADDSUB',     `add	$1 = $2, $3')
+  define(`CMP',        `cmp.ltu	$1,p0 = $2, $3')
+  define(`INCR',       1)
+  define(`LIM',        -1)
+  define(`func',        mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUB',     `sub	$1 = $2, $3')
+  define(`CMP',        `cmp.gtu	$1,p0 = $2, $3')
+  define(`INCR',       -1)
+  define(`LIM',        0)
+  define(`func',        mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUB',     `sub	$1 = $3, $2')
+  define(`CMP',        `cmp.gtu	$1,p0 = $2, $4')
+  define(`INCR',       -1)
+  define(`LIM',        0)
+  define(`func',        mpn_rsblsh`'LSH`'_n)')
+
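Per the macro selections above, the generated function computes
rp[] = up[] + (vp[] << LSH), up[] - (vp[] << LSH), or (vp[] << LSH) - up[],
with the unsigned compare detecting limb carries and shrp stitching the
shifted vp limbs together.  A C reference for the add flavor with LSH in
{1,2}, assuming 64-bit limbs (the name is ours; GMP's real entry points are
the mpn_addlshC_n family):

    typedef unsigned long long limb;

    /* rp[] = up[] + (vp[] << lsh); returns the carry, in [0, 2^lsh].
       vhi plays the role of the shrp pipeline: bits shifted out of one vp
       limb become the low bits of the next shifted limb.  */
    static limb addlsh_ref (limb *rp, const limb *up, const limb *vp,
                            long n, int lsh)
    {
      limb cy = 0, vhi = 0;
      for (long i = 0; i < n; i++)
        {
          limb x = (vp[i] << lsh) | vhi;
          vhi = vp[i] >> (64 - lsh);
          limb s = up[i] + x;
          limb c = s < x;          /* the CMP macro: carry from the add */
          rp[i] = s + cy;
          cy = c + (rp[i] < s);    /* adding cy cannot carry twice */
        }
      return vhi + cy;             /* the r8 return value */
    }
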
+define(PFDIST, 500)
+
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
+define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')
+
+C Non-stacked scratch registers used: r3 r8 r9 r10 r11
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	rp = 0, rp		C			M I
+	addp4	up = 0, up		C			M I
+	nop.i	0
+	addp4	vp = 0, vp		C			M I
+	nop.m	0
+	zxt4	n = n			C			I
+	;;
+')
+ {.mmi;	ld8	r11 = [vp], 8		C			M01
+	ld8	r10 = [up], 8		C			M01
+	mov.i	r2 = ar.lc		C			I0
+}{.mmi;	and	r14 = 3, n		C			M I
+	cmp.lt	p15, p0 = 4, n		C			M I
+	add	n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
+	cmp.eq	p7, p0 = 2, r14		C			M I
+	cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk	.Lb01			C			B
+  (p7)	br.dptk	.Lb10			C			B
+  (p8)	br.dptk	.Lb11			C			B
+}
+
+.Lb00:
+ {.mmi;	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+}{.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shl	x3 = r11, LSH		C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, r11, 64-LSH	C			I0
+}{.mmb;	ADDSUB(	w3, r10, x3)		C			M I
+	nop	0
+  (p15)	br.dpnt	.grt4			C			B
+	;;
+}{.mii;	CMP(	p7, w3, r10, x3)	C			M II0
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+}{.mii;	CMP(	p8, w0, u0, x0)		C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+	ADDSUB(	w1, u1, x1)		C			M I
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj4			C			B
+}
+ALIGN(32)
+.grt4:
+ {.mii;	ld8	v3 = [vp], 8		C			M01
+	shrp	x0 = v0, r11, 64-LSH	C			I0
+	CMP(	p8, w3, r10, x3)	C			M I
+	;;
+}{.mmi;	ld8	u3 = [up], 8		C			M01
+	add	r11 = PFDIST, vp
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	ADDSUB(	w0, u0, x0)		C			M I
+	nop	0
+	;;
+}{.mmi;	CMP(	p6, w0, u0, x0)		C			M I
+	add	r10 = PFDIST, up
+	mov.i	ar.lc = n		C			I0
+}{.mmb;	ADDSUB(	w1, u1, x1)		C			M I
+	ld8	u0 = [up], 8		C			M01
+	br	.LL00			C			B
+}
+
+	ALIGN(32)
+.Lb01:
+ifdef(`DO_add',
+`	shladd	w2 = r11, LSH, r10	C			M I
+	shr.u	r8 = r11, 64-LSH	C retval		I0
+  (p15)	br.dpnt	.grt1			C			B
+	;;
+',`
+	shl	x2 = r11, LSH		C			I0
+  (p15)	br.dpnt	.grt1			C			B
+	;;
+	ADDSUB(	w2, r10, x2)		C			M I
+	shr.u	r8 = r11, 64-LSH	C retval		I0
+	;;
+')
+	CMP(	p6, w2, r10, x2)	C			M I
+	br		.Lcj1
+
+.grt1:	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	mov.i	ar.lc = n		C FIXME swap with next	I0
+ifdef(`DO_add',
+`',`
+	ADDSUB(	w2, r10, x2)
+')
+	;;
+ {.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, r11, 64-LSH	C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mmb;	CMP(	p6, w2, r10, x2)	C			M I
+	ADDSUB(	w3, u3, x3)		C			M I
+	br.cloop.dptk	.grt5		C			B
+	;;
+}{.mmi;	CMP(	p7, w3, u3, x3)		C			M I
+	ADDSUB(	w0, u0, x0)		C			M I
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj5			C			B
+}
+.grt5:
+ {.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mmb;	ld8	v3 = [vp], 8		C			M01
+	CMP(	p8, w3, u3, x3)		C			M I
+	br	.LL01			C			B
+}
+	ALIGN(32)
+.Lb10:
+ {.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shl	x1 = r11, LSH		C			I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt2			C			B
+	;;
+}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
+	nop	0
+	shrp	x2 = v2, r11, 64-LSH	C			I0
+	;;
+}{.mmi;	CMP(	p9, w1, r10, x1)	C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	shr.u	r8 = v2, 64-LSH		C retval		I0
+	;;
+}{.mmb;	CMP(	p6, w2, u2, x2)		C			M I
+	nop	0
+	br	.Lcj2			C			B
+}
+.grt2:
+ {.mmi;	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	mov.i	ar.lc = n		C			I0
+}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
+	nop	0
+	nop	0
+	;;
+}{.mii;	ld8	v1 = [vp], 8		C			M01
+	shrp	x2 = v2, r11, 64-LSH	C			I0
+	CMP(	p8, w1, r10, x1)	C			M I
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, v2, 64-LSH	C			I0
+}{.mmi;	add	r11 = PFDIST, vp
+	ld8	v2 = [vp], 8		C			M01
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+}{.mmi;	CMP(	p6, w2, u2, x2)		C			M I
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mib;	ADDSUB(	w3, u3, x3)		C			M I
+	nop	0
+	br.cloop.dpnt	L(top)		C			B
+}
+	br	L(end)			C			B
+.Lb11:
+ {.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shl	x0 = r11, LSH		C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt3			C			B
+	;;
+}{.mii;	nop	0
+	shrp	x1 = v1, r11, 64-LSH	C			I0
+	ADDSUB(	w0, r10, x0)		C			M I
+	;;
+}{.mii;	CMP(	p8, w0, r10, x0)	C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+}{.mmb;	CMP(	p9, w1, u1, x1)		C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	br	.Lcj3			C			B
+}
+.grt3:
+ {.mmi;	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shrp	x1 = v1, r11, 64-LSH	C			I0
+}{.mmi;	ADDSUB(	w0, r10, x0)		C			M I
+	nop	0
+	nop	0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	CMP(	p6, w0, r10, x0)	C			M I
+	mov.i	ar.lc = n		C			I0
+}{.mmi;	ld8	u0 = [up], 8		C			M01
+	ADDSUB(	w1, u1, x1)		C			M I
+	nop	0
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+}{.mmb;	ld8	v1 = [vp], 8		C			M01
+	CMP(	p8, w1, u1, x1)		C			M I
+	br	.LL11			C			B
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):	st8	[rp] = w1, 8		C			M23
+	lfetch	[r10], 32
+   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
+   (p8)	add	w2 = INCR, w2		C			M I
+	ld8	v3 = [vp], 8		C			M01
+	CMP(	p8, w3, u3, x3)		C			M I
+	;;
+.LL01:	ld8	u3 = [up], 8		C			M01
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+   (p6)	cmpeqor	p8, p0 = LIM, w3	C			M I
+   (p6)	add	w3 = INCR, w3		C			M I
+	ld8	v0 = [vp], 8		C			M01
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+	st8	[rp] = w2, 8		C			M23
+	CMP(	p6, w0, u0, x0)		C			M I
+	nop.b	0
+	ld8	u0 = [up], 8		C			M01
+	lfetch	[r11], 32
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+.LL00:	st8	[rp] = w3, 8		C			M23
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+   (p8)	cmpeqor	p6, p0 = LIM, w0	C			M I
+   (p8)	add	w0 = INCR, w0		C			M I
+	ld8	v1 = [vp], 8		C			M01
+	CMP(	p8, w1, u1, x1)		C			M I
+	;;
+.LL11:	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, v2, 64-LSH	C			I0
+   (p6)	cmpeqor	p8, p0 = LIM, w1	C			M I
+   (p6)	add	w1 = INCR, w1		C			M I
+	ld8	v2 = [vp], 8		C			M01
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+ {.mmi;	st8	[rp] = w0, 8		C			M23
+	CMP(	p6, w2, u2, x2)		C			M I
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mib;
+	ld8	u2 = [up], 8		C			M01
+	ADDSUB(	w3, u3, x3)		C			M I
+	br.cloop.dptk	L(top)		C			B
+	;;
+}
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi;	st8	[rp] = w1, 8		C			M23
+   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmi;
+   (p8)	add	w2 = INCR, w2		C			M I
+	CMP(	p7, w3, u3, x3)		C			M I
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+}
+.Lcj5:
+ {.mmi;	st8	[rp] = w2, 8		C			M23
+   (p6)	cmpeqor	p7, p0 = LIM, w3	C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+}{.mmi;
+   (p6)	add	w3 = INCR, w3		C			M I
+	CMP(	p8, w0, u0, x0)		C			M I
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+}
+.Lcj4:
+ {.mmi;	st8	[rp] = w3, 8		C			M23
+   (p7)	cmpeqor	p8, p0 = LIM, w0	C			M I
+	mov.i	ar.lc = r2		C			I0
+}{.mmi;
+   (p7)	add	w0 = INCR, w0		C			M I
+	CMP(	p9, w1, u1, x1)		C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+}
+.Lcj3:
+ {.mmi;	st8	[rp] = w0, 8		C			M23
+   (p8)	cmpeqor	p9, p0 = LIM, w1	C			M I
+	shr.u	r8 = v2, 64-LSH		C			I0
+}{.mmi;
+   (p8)	add	w1 = INCR, w1		C			M I
+	CMP(	p6, w2, u2, x2)		C			M I
+	nop	0
+	;;
+}
+.Lcj2:
+ {.mmi;	st8	[rp] = w1, 8		C			M23
+   (p9)	cmpeqor	p6, p0 = LIM, w2	C			M I
+   (p9)	add	w2 = INCR, w2		C			M I
+	;;
+}
+.Lcj1:
+ {.mmb;	st8	[rp] = w2		C			M23
+ifdef(`DO_rsb',`
+   (p6)	add	r8 = -1, r8		C			M I
+',`
+   (p6)	add	r8 = 1, r8		C			M I
+')	br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm b/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..47e4553
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm
@@ -0,0 +1,516 @@
+dnl  IA-64 mpn_bdiv_dbm1c.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4
+C Itanium 2:  2
+
+C TODO
+C  * Optimize feed-in and wind-down code, both for speed and code size.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`bd', `r35')
+
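The loop below appears to implement the usual bdiv_dbm1c limb recurrence:
form the double-limb product a*bd (the xma.l/xma.hu pairs), subtract its low
half from a running value h, store h, then subtract the high half together
with the borrow.  A C sketch under the assumption of 64-bit limbs and a
compiler providing unsigned __int128; the fifth argument corresponds to r36,
the initial h:

    typedef unsigned long long limb;

    /* Reference sketch of the per-limb recurrence, not GMP's generic code. */
    static limb bdiv_dbm1c_ref (limb *qp, const limb *ap, long n,
                                limb bd, limb h)
    {
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) ap[i] * bd;
          limb p0 = (limb) p, p1 = (limb) (p >> 64);
          limb cy = h < p0;        /* cmp.ltu: borrow from the low half */
          h -= p0;
          qp[i] = h;               /* st8 */
          h -= p1 + cy;            /* predicated sub, with borrow */
        }
      return h;
    }
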
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+	.prologue
+	.save		ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmb
+	mov		r15 = r36		C M I
+	ldf8		f9 = [up], 8		C M
+	nop.b		0			C B
+}
+.Lcommon:
+{.mii
+	adds		r16 = -1, n		C M I
+	mov		r2 = ar.lc		C I0
+	and		r14 = 3, n		C M I
+	;;
+}
+{.mii
+	setf.sig	f6 = bd			C M2 M3
+	shr.u		r31 = r16, 2		C I0
+	cmp.eq		p10, p0 = 0, r14	C M I
+}
+{.mii
+	nop.m		0			C M
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1
+	;;
+	xma.l		f38 = f9, f6, f0
+	xma.hu		f39 = f9, f6, f0
+	;;
+	getf.sig	r26 = f38
+	getf.sig	r27 = f39
+	br		.Lcj1
+
+.grt1:	ldf8		f10 = [r33], 8
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	xma.l		f38 = f9, f6, f0
+	xma.hu		f39 = f9, f6, f0
+	;;
+	ldf8		f13 = [r33], 8
+	;;
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt5
+
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	br		.Lcj5
+
+.grt5:	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	br		.LL01
+
+.Lb10:	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt2
+	;;
+
+	xma.l		f36 = f9, f6, f0
+	xma.hu		f37 = f9, f6, f0
+	;;
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r27 = f39
+	br		.Lcj2
+
+.grt2:	ldf8		f10 = [r33], 8
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	xma.l		f36 = f9, f6, f0
+	xma.hu		f37 = f9, f6, f0
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt6
+
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	br		.Lcj6
+
+.grt6:	getf.sig	r25 = f37
+	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	br		.LL10
+
+
+.Lb11:	ldf8		f12 = [r33], 8
+	;;
+	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f34 = f9, f6, f0
+	xma.hu		f35 = f9, f6, f0
+	;;
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	;;
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	br		.Lcj3
+
+.grt3:	ldf8		f10 = [r33], 8
+	;;
+	xma.l		f34 = f9, f6, f0
+	xma.hu		f35 = f9, f6, f0
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt7
+
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	br		.Lcj7
+
+.grt7:	getf.sig	r25 = f37
+	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	br		.LL11
+
+
+.Lb00:	ldf8		f11 = [r33], 8
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt4
+	;;
+
+	xma.l		f32 = f9, f6, f0
+	xma.hu		f33 = f9, f6, f0
+	;;
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	;;
+	getf.sig	r24 = f36
+	br		.Lcj4
+
+.grt4:	xma.l		f32 = f9, f6, f0
+	xma.hu		f33 = f9, f6, f0
+	;;
+	ldf8		f10 = [r33], 8
+	;;
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.LL00
+	br		.Lcj8
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Ltop:
+	.pred.rel "mutex",p6,p7
+C	.mfi
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+  (p6)	sub		r15 = r19, r27, 1
+C	.mfi
+	st8		[r32] = r19, 8
+	xma.hu		f33 = f10, f6, f0
+  (p7)	sub		r15 = r19, r27
+	;;
+.LL00:
+C	.mfi
+	getf.sig	r25 = f37
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r20
+C	.mib
+	ldf8		f10 = [r33], 8
+	sub		r16 = r15, r20
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+  (p6)	sub		r15 = r16, r21, 1
+C	.mfi
+	st8		[r32] = r16, 8
+	xma.hu		f35 = f11, f6, f0
+  (p7)	sub		r15 = r16, r21
+	;;
+.LL11:
+C	.mfi
+	getf.sig	r27 = f39
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r22
+C	.mib
+	ldf8		f11 = [r33], 8
+	sub		r17 = r15, r22
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+  (p6)	sub		r15 = r17, r23, 1
+C	.mfi
+	st8		[r32] = r17, 8
+	xma.hu		f37 = f12, f6, f0
+  (p7)	sub		r15 = r17, r23
+	;;
+.LL10:
+C	.mfi
+	getf.sig	r21 = f33
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r24
+C	.mib
+	ldf8		f12 = [r33], 8
+	sub		r18 = r15, r24
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+  (p6)	sub		r15 = r18, r25, 1
+C	.mfi
+	st8		[r32] = r18, 8
+	xma.hu		f39 = f13, f6, f0
+  (p7)	sub		r15 = r18, r25
+	;;
+.LL01:
+C	.mfi
+	getf.sig	r23 = f35
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r26
+C	.mib
+	ldf8		f13 = [r33], 8
+	sub		r19 = r15, r26
+	br.cloop.sptk.few .Ltop
+C *** MAIN LOOP END ***
+	;;
+
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+  (p6)	sub		r15 = r19, r27, 1
+	st8		[r32] = r19, 8
+	xma.hu		f33 = f10, f6, f0
+  (p7)	sub		r15 = r19, r27
+	;;
+.Lcj8:	getf.sig	r25 = f37
+	cmp.ltu		p6, p7 = r15, r20
+	sub		r16 = r15, r20
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+  (p6)	sub		r15 = r16, r21, 1
+	st8		[r32] = r16, 8
+	xma.hu		f35 = f11, f6, f0
+  (p7)	sub		r15 = r16, r21
+	;;
+.Lcj7:	getf.sig	r27 = f39
+	cmp.ltu		p6, p7 = r15, r22
+	sub		r17 = r15, r22
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+  (p6)	sub		r15 = r17, r23, 1
+	st8		[r32] = r17, 8
+	xma.hu		f37 = f12, f6, f0
+  (p7)	sub		r15 = r17, r23
+	;;
+.Lcj6:	getf.sig	r21 = f33
+	cmp.ltu		p6, p7 = r15, r24
+	sub		r18 = r15, r24
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+  (p6)	sub		r15 = r18, r25, 1
+	st8		[r32] = r18, 8
+	xma.hu		f39 = f13, f6, f0
+  (p7)	sub		r15 = r18, r25
+	;;
+.Lcj5:	getf.sig	r23 = f35
+	cmp.ltu		p6, p7 = r15, r26
+	sub		r19 = r15, r26
+	;;
+	getf.sig	r24 = f36
+  (p6)	sub		r15 = r19, r27, 1
+	st8		[r32] = r19, 8
+  (p7)	sub		r15 = r19, r27
+	;;
+.Lcj4:	getf.sig	r25 = f37
+	cmp.ltu		p6, p7 = r15, r20
+	sub		r16 = r15, r20
+	;;
+	getf.sig	r26 = f38
+  (p6)	sub		r15 = r16, r21, 1
+	st8		[r32] = r16, 8
+  (p7)	sub		r15 = r16, r21
+	;;
+.Lcj3:	getf.sig	r27 = f39
+	cmp.ltu		p6, p7 = r15, r22
+	sub		r17 = r15, r22
+	;;
+  (p6)	sub		r15 = r17, r23, 1
+	st8		[r32] = r17, 8
+  (p7)	sub		r15 = r17, r23
+	;;
+.Lcj2:	cmp.ltu		p6, p7 = r15, r24
+	sub		r18 = r15, r24
+	;;
+  (p6)	sub		r15 = r18, r25, 1
+	st8		[r32] = r18, 8
+  (p7)	sub		r15 = r18, r25
+	;;
+.Lcj1:	cmp.ltu		p6, p7 = r15, r26
+	sub		r19 = r15, r26
+	;;
+  (p6)	sub		r8 = r19, r27, 1
+	st8		[r32] = r19
+  (p7)	sub		r8 = r19, r27
+	mov ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/cnd_aors_n.asm b/third_party/gmp/mpn/ia64/cnd_aors_n.asm
new file mode 100644
index 0000000..edd0552
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/cnd_aors_n.asm
@@ -0,0 +1,264 @@
+dnl  IA-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.5
+
+C INPUT PARAMETERS
+define(`cnd', `r32')
+define(`rp',  `r33')
+define(`up',  `r34')
+define(`vp',  `r35')
+define(`n',   `r36')
+
+ifdef(`OPERATION_cnd_add_n',`
+  define(ADDSUB,	add)
+  define(CND,		ltu)
+  define(INCR,		1)
+  define(LIM,		-1)
+  define(func,    mpn_cnd_add_n)
+')
+ifdef(`OPERATION_cnd_sub_n',`
+  define(ADDSUB,	sub)
+  define(CND,		gtu)
+  define(INCR,		-1)
+  define(LIM,		0)
+  define(func,    mpn_cnd_sub_n)
+')
+
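The cnd argument is normalized to a full-width mask up front (the predicated
"(p6) mov cnd = -1" / "(p7) mov cnd = 0" pair below), after which every vp
limb is ANDed with it.  The add or subtract thus runs unconditionally on
either vp[i] or zero, with no data-dependent branch.  A C sketch of the add
flavor, assuming 64-bit limbs (the name is ours):

    typedef unsigned long long limb;

    /* rp[] = up[] + (cnd ? vp[] : 0), branch-free; returns the carry out. */
    static limb cnd_add_ref (limb *rp, const limb *up, const limb *vp,
                             long n, limb cnd)
    {
      limb mask = cnd != 0 ? ~(limb) 0 : 0;  /* normalize cnd to -1 or 0 */
      limb cy = 0;
      for (long i = 0; i < n; i++)
        {
          limb x = vp[i] & mask;
          limb s = up[i] + x;
          limb c = s < x;
          rp[i] = s + cy;
          cy = c + (rp[i] < s);
        }
      return cy;
    }
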
+define(PFDIST, 160)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`up1',`up') define(`up2',`r8') define(`upadv',`r1')
+define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11')
+define(`rp1',`rp') define(`rp2',`r10')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	rp = 0, rp		C				M I
+	addp4	up = 0, up		C				M I
+	nop.i	0
+	addp4	vp = 0, vp		C				M I
+	nop.m	0
+	zxt4	n = n			C				I
+	;;
+')
+ {.mmi;	and	r3 = 3, n		C				M I
+	add	n = -1, n		C				M I
+	mov	r2 = ar.lc		C				I0
+}{.mmi;	cmp.ne	p6, p7 = 0, cnd		C				M I
+	add	vp2 = 8, vp		C				M I
+	add	up2 = 8, up		C				M I
+	;;
+}{.mmi;	add	upadv = PFDIST, up	C				M I
+	add	vpadv = PFDIST, vp	C				M I
+	shr.u	n = n, 2		C				I0
+	.pred.rel "mutex", p6, p7
+}{.mmi;	add	rp2 = 8, rp		C				M I
+   (p6)	mov	cnd = -1		C				M I
+   (p7)	mov	cnd = 0			C				M I
+	;;
+}	cmp.eq	p9, p0 = 1, r3		C				M I
+	cmp.eq	p7, p0 = 2, r3		C				M I
+	cmp.eq	p8, p0 = 3, r3		C				M I
+   (p9)	br	L(b1)			C				B
+   (p7)	br	L(b2)			C				B
+   (p8)	br	L(b3)			C				B
+	;;
+L(b0):
+ {.mmi;	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	mov	ar.lc = n		C				I0
+	;;
+}	ld8	u2 = [up1], 16		C				M01
+	ld8	u3 = [up2], 16		C				M01
+	and	x2 = v2, cnd		C				M I
+	and	x3 = v3, cnd		C				M I
+	;;
+	ADDSUB	w2 = u2, x2		C				M I
+	ADDSUB	w3 = u3, x3		C				M I
+	;;
+	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	cmp.CND	p8, p0 = w2, u2		C				M I
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	br	L(lo0)
+
+L(b1):	ld8	v1 = [vp1], 8		C				M01
+	add	vp2 = 8, vp2		C				M I
+	add	rp2 = 8, rp2		C				M I
+	;;
+	ld8	u1 = [up1], 8		C				M01
+	add	up2 = 8, up2		C				M I
+	and	x1 = v1, cnd		C				M I
+	;;
+	ADDSUB	w1 = u1, x1		C				M I
+	cmp.ne	p10, p0 = 0, n
+	add	n = -1, n
+	;;
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	st8	[rp1] = w1, 8		C				M23
+  (p10)	br	L(b0)
+	;;
+	mov	r8 = 0			C				M I
+	br	L(e1)
+
+L(b3):	ld8	v3 = [vp1], 8		C				M01
+	add	vp2 = 8, vp2		C				M I
+	add	rp2 = 8, rp2		C				M I
+	;;
+	ld8	u3 = [up1], 8		C				M01
+	add	up2 = 8, up2		C				M I
+	and	x3 = v3, cnd		C				M I
+	;;
+	ADDSUB	w3 = u3, x3		C				M I
+	;;
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	st8	[rp1] = w3, 8		C				M23
+	C fall through
+
+L(b2):
+ {.mmi;	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	mov	ar.lc = n		C				I0
+	;;
+}	ld8	u0 = [up1], 16		C				M01
+	ld8	u1 = [up2], 16		C				M01
+	and	x0 = v0, cnd		C				M I
+	and	x1 = v1, cnd		C				M I
+	;;
+	ADDSUB	w0 = u0, x0		C				M I
+	ADDSUB	w1 = u1, x1		C				M I
+	br.cloop.dptk	L(gt2)		C				B
+	;;
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	br		L(e2)		C				B
+L(gt2):
+	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	br		L(lo2)		C				B
+
+
+C *** MAIN LOOP START ***
+C	ALIGN(32)
+L(top):
+ {.mmi;	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	cmp.CND	p6, p0 = w0, u0		C				M I
+}{.mmi;	st8	[rp1] = w2, 16		C				M23
+	st8	[rp2] = w3, 16		C				M23
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	;;
+}
+L(lo2):
+ {.mmi;	ld8	u2 = [up1], 16		C				M01
+	ld8	u3 = [up2], 16		C				M01
+   (p9)	cmpeqor	p6, p0 = LIM, w0	C				M I
+}{.mmi;	and	x2 = v2, cnd		C				M I
+	and	x3 = v3, cnd		C				M I
+   (p9)	add	w0 = INCR, w0		C				M I
+	;;
+}{.mmi;	ADDSUB	w2 = u2, x2		C				M I
+   (p6)	cmpeqor	p7, p0 = LIM, w1	C				M I
+   (p6)	add	w1 = INCR, w1		C				M I
+}{.mmi;	ADDSUB	w3 = u3, x3		C				M I
+	lfetch	[upadv], 32
+	nop	0
+	;;
+}{.mmi;	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	cmp.CND	p8, p0 = w2, u2		C				M I
+}{.mmi;	st8	[rp1] = w0, 16		C				M23
+	st8	[rp2] = w1, 16		C				M23
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	;;
+}
+L(lo0):
+ {.mmi;	ld8	u0 = [up1], 16		C				M01
+	ld8	u1 = [up2], 16		C				M01
+   (p7)	cmpeqor	p8, p0 = LIM, w2	C				M I
+}{.mmi;	and	x0 = v0, cnd		C				M I
+	and	x1 = v1, cnd		C				M I
+   (p7)	add	w2 = INCR, w2		C				M I
+	;;
+}{.mmi;	ADDSUB	w0 = u0, x0		C				M I
+   (p8)	cmpeqor	p9, p0 = LIM, w3	C				M I
+   (p8)	add	w3 = INCR, w3		C				M I
+}{.mmb;	ADDSUB	w1 = u1, x1		C				M I
+	lfetch	[vpadv], 32
+	br.cloop.dptk	L(top)		C				B
+	;;
+}
+C *** MAIN LOOP END ***
+
+
+L(end):
+ {.mmi;	st8	[rp1] = w2, 16		C				M23
+	st8	[rp2] = w3, 16		C				M23
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	;;
+}
+L(e2):
+ {.mmi;	cmp.CND	p7, p0 = w1, u1		C				M I
+   (p9)	cmpeqor	p6, p0 = LIM, w0	C				M I
+   (p9)	add	w0 = INCR, w0		C				M I
+	;;
+}{.mmi;	mov	r8 = 0			C				M I
+   (p6)	cmpeqor	p7, p0 = LIM, w1	C				M I
+   (p6)	add	w1 = INCR, w1		C				M I
+	;;
+}{.mmi;	st8	[rp1] = w0, 16		C				M23
+	st8	[rp2] = w1, 16		C				M23
+	mov	ar.lc = r2		C				I0
+}
+L(e1):
+ {.mmb;	nop	0
+   (p7)	mov	r8 = 1			C				M I
+	br.ret.sptk.many b0		C				B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/copyd.asm b/third_party/gmp/mpn/ia64/copyd.asm
new file mode 100644
index 0000000..b94a1af
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/copyd.asm
@@ -0,0 +1,186 @@
+dnl  IA-64 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    1
+C Itanium 2:  0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
+
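Functionally the routine is just a descending limb copy; the assembly's work
is the 4-way unrolling and software pipelining.  A C statement of the
semantics (assuming 64-bit limbs), which also shows why the descending order
is safe when rp and sp overlap with rp above sp:

    typedef unsigned long long limb;

    /* Semantics of mpn_copyd: rp[i] is written only after all higher indices
       have been read, so an overlap with rp > sp is harmless.  mpn_copyi
       (the next file) is the ascending counterpart.  */
    static void copyd_ref (limb *rp, const limb *sp, long n)
    {
      for (long i = n - 1; i >= 0; i--)
        rp[i] = sp[i];
    }
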
+ASM_START()
+PROLOGUE(mpn_copyd)
+	.prologue
+	.save ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	addp4		r33 = 0, r33
+	sxt4		r34 = r34
+	;;
+')
+{.mmi
+	shladd		r32 = r34, 3, r32
+	shladd		r33 = r34, 3, r33
+	mov.i		r2 = ar.lc
+}
+{.mmi
+	and		r14 = 3, r34
+	cmp.ge		p14, p15 = 3, r34
+	add		r34 = -4, r34
+	;;
+}
+{.mmi
+	cmp.eq		p8, p0 = 1, r14
+	cmp.eq		p10, p0 = 2, r14
+	cmp.eq		p12, p0 = 3, r14
+}
+{.bbb
+  (p8)	br.dptk		.Lb01
+  (p10)	br.dptk		.Lb10
+  (p12)	br.dptk		.Lb11
+}
+
+.Lb00:	C  n = 0, 4, 8, 12, ...
+	add		r32 = -8, r32
+	add		r33 = -8, r33
+  (p14)	br.dptk		.Ls00
+	;;
+	add		r21 = -8, r33
+	ld8		r16 = [r33], -16
+	shr		r15 = r34, 2
+	;;
+	ld8		r17 = [r21], -16
+	mov.i		ar.lc = r15
+	ld8		r18 = [r33], -16
+	add		r20 = -8, r32
+	;;
+	ld8		r19 = [r21], -16
+	br.cloop.dptk	.Loop
+	;;
+	br.sptk		.Lend
+	;;
+
+.Lb01:	C  n = 1, 5, 9, 13, ...
+	add		r21 = -8, r33
+	add		r20 = -8, r32
+	add		r33 = -16, r33
+	add		r32 = -16, r32
+	;;
+	ld8		r19 = [r21], -16
+	shr		r15 = r34, 2
+  (p14)	br.dptk		.Ls01
+	;;
+	ld8		r16 = [r33], -16
+	mov.i		ar.lc = r15
+	;;
+	ld8		r17 = [r21], -16
+	ld8		r18 = [r33], -16
+	br.sptk		.Li01
+	;;
+
+.Lb10:	C  n = 2, 6, 10, 14, ...
+	add		r21 = -16, r33
+	shr		r15 = r34, 2
+	add		r20 = -16, r32
+	add		r32 = -8, r32
+	add		r33 = -8, r33
+	;;
+	ld8		r18 = [r33], -16
+	ld8		r19 = [r21], -16
+	mov.i		ar.lc = r15
+  (p14)	br.dptk		.Ls10
+	;;
+	ld8		r16 = [r33], -16
+	ld8		r17 = [r21], -16
+	br.sptk		.Li10
+	;;
+
+.Lb11:	C  n = 3, 7, 11, 15, ...
+	add		r21 = -8, r33
+	add		r20 = -8, r32
+	add		r33 = -16, r33
+	add		r32 = -16, r32
+	;;
+	ld8		r17 = [r21], -16
+	shr		r15 = r34, 2
+	;;
+	ld8		r18 = [r33], -16
+	mov.i		ar.lc = r15
+	ld8		r19 = [r21], -16
+  (p14)	br.dptk		.Ls11
+	;;
+	ld8		r16 = [r33], -16
+	br.sptk		.Li11
+	;;
+
+	ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+	st8		[r32] = r16, -16
+	ld8		r16 = [r33], -16
+	nop.b		0
+}
+.Li11:
+{.mmb
+	st8		[r20] = r17, -16
+	ld8		r17 = [r21], -16
+	nop.b		0
+	;;
+}
+.Li10:
+{.mmb
+	st8		[r32] = r18, -16
+	ld8		r18 = [r33], -16
+	nop.b		0
+}
+.Li01:
+{.mmb
+	st8		[r20] = r19, -16
+	ld8		r19 = [r21], -16
+	br.cloop.dptk	.Loop
+	;;
+}
+.Lend:	st8		[r32] = r16, -16
+.Ls11:	st8		[r20] = r17, -16
+	;;
+.Ls10:	st8		[r32] = r18, -16
+.Ls01:	st8		[r20] = r19, -16
+.Ls00:	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/copyi.asm b/third_party/gmp/mpn/ia64/copyi.asm
new file mode 100644
index 0000000..49ed192
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/copyi.asm
@@ -0,0 +1,182 @@
+dnl  IA-64 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    1
+C Itanium 2:  0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	.prologue
+	.save ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	addp4		r33 = 0, r33
+	sxt4		r34 = r34
+	;;
+')
+{.mmi
+	nop		0
+	nop		0
+	mov.i		r2 = ar.lc
+}
+{.mmi
+	and		r14 = 3, r34
+	cmp.ge		p14, p15 = 3, r34
+	add		r34 = -4, r34
+	;;
+}
+{.mmi
+	cmp.eq		p8, p0 = 1, r14
+	cmp.eq		p10, p0 = 2, r14
+	cmp.eq		p12, p0 = 3, r14
+}
+{.bbb
+  (p8)	br.dptk		.Lb01
+  (p10)	br.dptk		.Lb10
+  (p12)	br.dptk		.Lb11
+}
+
+.Lb00:	C  n = 0, 4, 8, 12, ...
+  (p14)	br.dptk		.Ls00
+	;;
+	add		r21 = 8, r33
+	ld8		r16 = [r33], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r17 = [r21], 16
+	mov.i		ar.lc = r15
+	ld8		r18 = [r33], 16
+	add		r20 = 8, r32
+	;;
+	ld8		r19 = [r21], 16
+	br.cloop.dptk	.Loop
+	;;
+	br.sptk		.Lend
+	;;
+
+.Lb01:	C  n = 1, 5, 9, 13, ...
+	add		r21 = 0, r33
+	add		r20 = 0, r32
+	add		r33 = 8, r33
+	add		r32 = 8, r32
+	;;
+	ld8		r19 = [r21], 16
+	shr		r15 = r34, 2
+  (p14)	br.dptk		.Ls01
+	;;
+	ld8		r16 = [r33], 16
+	mov.i		ar.lc = r15
+	;;
+	ld8		r17 = [r21], 16
+	ld8		r18 = [r33], 16
+	br.sptk		.Li01
+	;;
+
+.Lb10:	C  n = 2, 6, 10, 14, ...
+	add		r21 = 8, r33
+	add		r20 = 8, r32
+	ld8		r18 = [r33], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r19 = [r21], 16
+	mov.i		ar.lc = r15
+  (p14)	br.dptk		.Ls10
+	;;
+	ld8		r16 = [r33], 16
+	ld8		r17 = [r21], 16
+	br.sptk		.Li10
+	;;
+
+.Lb11:	C  n = 3, 7, 11, 15, ...
+	add		r21 = 0, r33
+	add		r20 = 0, r32
+	add		r33 = 8, r33
+	add		r32 = 8, r32
+	;;
+	ld8		r17 = [r21], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r18 = [r33], 16
+	mov.i		ar.lc = r15
+	ld8		r19 = [r21], 16
+  (p14)	br.dptk		.Ls11
+	;;
+	ld8		r16 = [r33], 16
+	br.sptk		.Li11
+	;;
+
+	ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+	st8		[r32] = r16, 16
+	ld8		r16 = [r33], 16
+	nop.b		0
+}
+.Li11:
+{.mmb
+	st8		[r20] = r17, 16
+	ld8		r17 = [r21], 16
+	nop.b		0
+	;;
+}
+.Li10:
+{.mmb
+	st8		[r32] = r18, 16
+	ld8		r18 = [r33], 16
+	nop.b		0
+}
+.Li01:
+{.mmb
+	st8		[r20] = r19, 16
+	ld8		r19 = [r21], 16
+	br.cloop.dptk	.Loop
+	;;
+}
+.Lend:	st8		[r32] = r16, 16
+.Ls11:	st8		[r20] = r17, 16
+	;;
+.Ls10:	st8		[r32] = r18, 16
+.Ls01:	st8		[r20] = r19, 16
+.Ls00:	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/dive_1.asm b/third_party/gmp/mpn/ia64/dive_1.asm
new file mode 100644
index 0000000..5e4a273
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/dive_1.asm
@@ -0,0 +1,236 @@
+dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C Itanium:      16
+C Itanium 2:     8
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`divisor', `r35')
+
+define(`lshift', `r24')
+define(`rshift', `r25')
+
+C This code is a bit messy, and not as similar to mode1o.asm as desired.
+
+C The critical path during initialization is for computing the inverse of the
+C divisor.  Since odd divisors are probably common, we conditionally execute
+C the initial count_trailing_zeros code and the downshift.
+
+C Possible improvement: Merge more of the feed-in code into the inverse
+C computation.
+
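+C In outline, as an illustrative C sketch only (i and tab are our names,
+C not GMP interfaces): strip the trailing zeros of the divisor, lift an
+C 8-bit inverse from the table below to a full inverse mod 2^64, then turn
+C each division step into a multiplication:
+C
+C	i = tab[divisor & 0xff];	/* 8-bit inverse of odd divisor */
+C	i = 2*i - i*i*divisor;		/* Newton/Hensel: 16 correct bits */
+C	i = 2*i - i*i*divisor;		/* 32 correct bits */
+C	i = 2*i - i*i*divisor;		/* 64 bits: i*divisor == 1 mod 2^64 */
+C	/* per limb: q = (ulimb - c) * i;  c = high (q * divisor + c) */
+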
+ASM_START()
+	.text
+	.align	32
+.Ltab:
+data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_divexact_1)
+	.prologue
+	.save		ar.lc, r2
+	.body
+
+ {.mmi;	add		r8 = -1, divisor	C M0
+	nop		0			C M1
+	tbit.z		p8, p9 = divisor, 0	C I0
+}
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M2  rp extend
+	addp4		up = 0, up		C M3  up extend
+	sxt4		n = n')			C I1  size extend
+	;;
+.Lhere:
+ {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
+  (p8)	andcm		r8 = r8, divisor	C M1
+	mov		r15 = ip		C I0  .Lhere
+	;;
+}{.mii
+	.pred.rel "mutex", p8, p9
+  (p9)	mov		rshift = 0		C M0
+  (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
+	cmp.eq		p6, p10 = 1, n		C I1
+	;;
+}{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
+  (p8)	shr.u		divisor = divisor, rshift C I0
+	nop		0			C I1
+	;;
+}{.mmi;	add		n = -4, n		C M0  size-1
+  (p10)	ld8		r21 = [up], 8		C M1  up[1]
+	mov		r14 = 2			C M1  2
+}{.mfi;	setf.sig	f6 = divisor		C M2  divisor
+	mov		f9 = f0			C M3  carry		FIXME
+	zxt1		r3 = divisor		C I1  divisor low byte
+	;;
+}{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
+	sub		r16 = 0, divisor	C M1  -divisor
+	mov		r2 = ar.lc		C I0
+}{.mmi;	sub		lshift = 64, rshift	C M2
+	setf.sig	f13 = r14		C M3  2 in significand
+	mov		r17 = -1		C I1  -1
+	;;
+}{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
+	nop		0			C M1
+	mov		ar.lc = n		C I0  size-1 loop count
+}{.mmi;	setf.sig	f12 = r16		C M2  -divisor
+	setf.sig	f8 = r17		C M3  -1
+	cmp.eq		p7, p0 = -2, n		C I1
+	;;
+}{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
+	cmp.eq		p8, p0 = -1, n		C M0
+	shr.u		r23 = r20, rshift	C I0
+	;;
+}
+
+	C f6	divisor
+	C f7	inverse, being calculated
+	C f8	-1, will be -inverse
+	C f9	carry
+	C f12	-divisor
+	C f13	2
+	C f14	scratch
+
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
+	;;
+	setf.sig	f10 = r23		C speculative, used iff n = 1
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	shl		r22 = r21, lshift	C speculative, used iff n > 1
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	or		r31 = r22, r23		C speculative, used iff n > 1
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
+	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
+	;;
+	setf.sig	f11 = r31		C speculative, used iff n > 1
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
+
+  (p7)	br.cond.dptk	.Ln2
+  (p10)	br.cond.dptk	.grt3
+	;;
+
+.Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
+	br		.Lx1
+
+.Ln2:
+	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
+	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
+	setf.sig	f11 = r23
+	br		.Lx2
+
+.grt3:
+	ld8		r21 = [up], 8		C up[2]
+	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
+	;;
+	shl		r22 = r21, lshift
+	;;
+	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
+	;;
+	or		r31 = r22, r23
+	shr.u		r23 = r21, rshift
+	;;
+	setf.sig	f11 = r31
+  (p8)	br.cond.dptk	.Lx3			C branch for n = 3
+	;;
+	ld8		r21 = [up], 8
+	br		.Lent
+
+.Ltop:	ld8		r21 = [up], 8
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	nop.b		0
+	;;
+.Lent:	add		r16 = 160, up
+	shl		r22 = r21, lshift
+	nop.b		0
+	;;
+	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	nop.b		0
+	nop.m		0
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	nop.b		0
+	;;
+	or		r31 = r22, r23
+	shr.u		r23 = r21, rshift
+	nop.b		0
+	;;
+	lfetch		[r16]
+	setf.sig	f11 = r31
+	br.cloop.sptk.few.clr .Ltop
+
+
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx3:	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	;;
+	setf.sig	f11 = r23
+	;;
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx2:	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	;;
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx1:	stf8		[rp] = f12, 8
+	mov		ar.lc = r2		C I0
+	br.ret.sptk.many b0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/divrem_1.asm b/third_party/gmp/mpn/ia64/divrem_1.asm
new file mode 100644
index 0000000..e887820
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/divrem_1.asm
@@ -0,0 +1,477 @@
+dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
+dnl  unnormalized limb.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C Itanium:    40-42
+C Itanium 2:  29-30
+
+C This was generated by gcc, then the loops were optimized.  The preinv entry
+C point was shoehorned into the file.  Lots of things outside the loops could
+C be streamlined.  It would probably be a good idea to merge the loops for
+C normalized and unnormalized divisor, since the shifting stuff is done for
+C free in parallel with other operations.  It would even be possible to merge
+C all loops, if the ld8 were made conditional.
+
+C TODO
+C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
+C    computing leading limb.
+C  * Inline and interleave limb inversion code with loop setup code.
+
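+C The quotient development in the loops below is, in essence, division by
+C a normalized d using a precomputed inverse di = floor((B^2-1)/d) - B,
+C B = 2^64 (an illustrative C-style sketch only; names are ours):
+C
+C	q = high (nh * di) + nh;	/* quotient estimate, a bit low */
+C	(ph, pl) = q * d;		/* 128-bit product */
+C	(rh, rl) = (nh, nl) - (ph, pl);
+C	while (rh != 0 || rl >= d)	/* a few conditional fixups */
+C	  { q++; (rh, rl) -= d; }
+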
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+	.global	mpn_invert_limb
+	.type	mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C rp    = r32
+C qxn   = r33
+C up    = r34
+C n     = r35
+C vl    = r36
+C vlinv = r37  (preinv only)
+C cnt   = r38  (preinv only)
+
+PROLOGUE(mpn_preinv_divrem_1)
+	.prologue
+	.save	ar.pfs, r42
+	alloc		r42 = ar.pfs, 7, 8, 1, 0
+	.save	ar.lc, r44
+	mov		r44 = ar.lc
+	.save	rp, r41
+	mov		r41 = b0
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	sxt4		r33 = r33
+	addp4		r34 = 0, r34
+	sxt4		r35 = r35
+	;;
+')
+	mov		r40 = r38
+	shladd		r34 = r35, 3, r34
+	;;
+	adds		r34 = -8, r34
+	;;
+	ld8		r39 = [r34], -8
+	;;
+
+	add		r15 = r35, r33
+	;;
+	mov		r8 = r37
+	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
+	cmp.le		p8, p0 = 0, r36
+	;;
+	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
+	cmp.leu		p6, p7 = r36, r39
+   (p8)	br.cond.dpnt	.Lpunnorm
+	;;
+
+   (p6)	addl		r15 = 1, r0
+   (p7)	mov		r15 = r0
+	;;
+   (p6)	sub		r38 = r39, r36
+   (p7)	mov		r38 = r39
+	st8		[r32] = r15, -8
+	adds		r35 = -2, r35		C un -= 2
+	br	.Lpn
+
+.Lpunnorm:
+   (p6)	add		r34 = 8, r34
+	mov		r38 = 0			C r = 0
+	shl		r36 = r36, r40
+   (p6)	br.cond.dptk	.Lpu
+	;;
+	shl		r38 = r39, r40		C r = ahigh << cnt
+	cmp.ne		p8, p0 = 1, r35
+	st8		[r32] = r0, -8
+	adds		r35 = -1, r35		C un--
+   (p8)	br.cond.dpnt	.Lpu
+
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	br		.L435
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+	.prologue
+	.save	ar.pfs, r42
+	alloc		r42 = ar.pfs, 5, 8, 1, 0
+	.save	ar.lc, r44
+	mov		r44 = ar.lc
+	.save	rp, r41
+	mov		r41 = b0
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	sxt4		r33 = r33
+	addp4		r34 = 0, r34
+	sxt4		r35 = r35
+	;;
+')
+	mov		r38 = r0
+	add		r15 = r35, r33
+	;;
+	cmp.ne		p6, p7 = 0, r15
+	;;
+   (p7)	mov		r8 = r0
+   (p7)	br.cond.dpnt	.Lret
+	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
+	cmp.le		p6, p7 = 0, r36
+	;;
+	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
+   (p6)	br.cond.dpnt	.Lunnorm
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L179
+	shladd		r14 = r35, 3, r34
+	;;
+	adds		r14 = -8, r14
+	adds		r35 = -1, r35
+	;;
+	ld8		r38 = [r14]
+	;;
+	cmp.leu		p6, p7 = r36, r38
+	;;
+   (p6)	addl		r15 = 1, r0
+   (p7)	mov		r15 = r0
+	;;
+	st8		[r32] = r15, -8
+  (p6)	sub		r38 = r38, r36
+
+.L179:
+	mov		r45 = r36
+	adds		r35 = -1, r35
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+	shladd		r34 = r35, 3, r34
+.Lpn:
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	cmp.le		p6, p7 = 0, r35
+	mov		r40 = 0
+   (p7)	br.cond.dpnt	.L435
+	setf.sig	f10 = r36
+	mov		ar.lc = r35
+	setf.sig	f7 = r38
+	;;
+	sub		r28 = -1, r36
+C Develop quotient limbs for normalized divisor
+.Loop1:		C 00				C q=r18 nh=r38/f7
+	ld8		r20 = [r34], -8
+	xma.hu		f11 = f7, f6, f0
+	;;	C 04
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;	C 08
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	xma.l		f8 = f8, f10, f0
+	;;	C 12
+	getf.sig	r16 = f9
+		C 13
+	getf.sig	r15 = f8
+	;;	C 18
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;	C 19
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;	C 20
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 21
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;	C 22
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 23
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 24
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+	br.cloop.dptk	.Loop1
+		C 29/30
+	br.sptk		.L435
+	;;
+.Lunnorm:
+	mux1		r16 = r36, @rev
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L322
+	shladd		r34 = r35, 3, r34
+	;;
+	adds		r34 = -8, r34
+	;;
+	ld8		r39 = [r34]
+	;;
+	cmp.leu		p6, p7 = r36, r39
+   (p6)	br.cond.dptk	.L322
+	adds		r34 = -8, r34
+	;;
+	mov		r38 = r39
+	;;
+	cmp.ne		p6, p7 = 1, r15
+	st8		[r32] = r0, -8
+	;;
+   (p7)	mov		r8 = r38
+   (p7)	br.cond.dpnt	.Lret
+	adds		r35 = -1, r35
+.L322:
+	sub		r14 = r0, r16
+	;;
+	or		r14 = r16, r14
+	;;
+	mov		r16 = -8
+	czx1.l		r14 = r14
+	;;
+	shladd		r16 = r14, 3, r16
+	;;
+	shr.u		r14 = r36, r16
+	;;
+	cmp.geu		p6, p7 = 15, r14
+	;;
+   (p7)	shr.u		r14 = r14, 4
+   (p7)	adds		r16 = 4, r16
+	;;
+	cmp.geu		p6, p7 = 3, r14
+	;;
+   (p7)	shr.u		r14 = r14, 2
+   (p7)	adds		r16 = 2, r16
+	;;
+	tbit.nz		p6, p7 = r14, 1
+	;;
+	.pred.rel "mutex",p6,p7
+  (p6)	sub		r40 = 62, r16
+  (p7)	sub		r40 = 63, r16
+	;;
+	shl		r45 = r36, r40
+	shl		r36 = r36, r40
+	shl		r38 = r38, r40
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+.Lpu:
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L435
+	sub		r16 = 64, r40
+	adds		r35 = -2, r35
+	;;
+	ld8		r39 = [r34], -8
+	cmp.le		p6, p7 = 0, r35
+	;;
+	shr.u		r14 = r39, r16
+	;;
+	or		r38 = r14, r38
+   (p7)	br.cond.dpnt	.Lend3
+	;;
+	mov		r22 = r16
+	setf.sig	f10 = r36
+	setf.sig	f7 = r38
+	mov		ar.lc = r35
+	;;
+C Develop quotient limbs for unnormalized divisor
+.Loop3:
+	ld8		r14 = [r34], -8
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	shl		r20 = r39, r40
+	xma.l		f8 = f8, f10, f0
+	shr.u		r24 = r14, r22
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	or		r20 = r24, r20
+	;;
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r39 = r14
+	mov		r38 = r15
+	br.cloop.dptk	.Loop3
+	;;
+.Lend3:
+	setf.sig	f10 = r36
+	setf.sig	f7 = r38
+	;;
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	shl		r20 = r39, r40
+	xma.l		f8 = f8, f10, f0
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	;;
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	;;
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	cmp.ltu		p6, p7 = r15, r36
+	;;
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+.L435:
+	adds		r35 = -1, r33
+	cmp.le		p6, p7 = 1, r33
+   (p7)	br.cond.dpnt	.Lend4
+	;;
+	setf.sig	f7 = r38
+	setf.sig	f10 = r36
+	mov		ar.lc = r35
+	;;
+.Loop4:
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	xma.l		f8 = f8, f10, f0
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	;;
+	cmp.ltu		p6, p7 = 0, r15
+	sub		r15 = 0, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+	br.cloop.dptk	.Loop4
+	;;
+.Lend4:
+	shr.u		r8 = r38, r40
+.Lret:
+	mov		ar.pfs = r42
+	mov		ar.lc = r44
+	mov		b0 = r41
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/divrem_2.asm b/third_party/gmp/mpn/ia64/divrem_2.asm
new file mode 100644
index 0000000..9864311
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/divrem_2.asm
@@ -0,0 +1,280 @@
+dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C               norm   frac
+C Itanium:        ?      ?
+C Itanium 2:     29     29
+
+
+C TODO
+C  * Inline and interleave limb inversion code with loop setup code.
+C  * We should use explicit bundling in much of the code, since it typically
+C    cuts some cycles with the GNU assembler.
+
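+C The loops develop one quotient limb per iteration using the 3/2-limb
+C method of Moller-Granlund, "Improved division by invariant integers";
+C roughly, as an illustrative C-style sketch (names are ours):
+C
+C	(q, q0)  = n2 * di + (n2, n1);		/* candidate quotient */
+C	(r1, r0) = (n1 - q*d1, n0) - q*d0 - (d1, d0);
+C	q++;
+C	if (r1 >= q0)  { q--; (r1, r0) += (d1, d0); }
+C	if ((r1, r0) >= (d1, d0))  { q++; (r1, r0) -= (d1, d0); }  /* rare */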
+
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+	.global	mpn_invert_limb
+	.type	mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C qp   = r32
+C fn   = r33
+C np   = r34
+C nn   = r35
+C dp   = r36
+
+define(`f0x1', `f15')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32		C M I
+	addp4		r34 = 0, r34		C M I
+	zxt4		r35 = r35		C I
+	addp4		r36 = 0, r36		C M I
+	nop.m		0
+	zxt4		r33 = r33		C I
+	;;
+')
+	.save ar.pfs, r42
+	alloc	 r42 = ar.pfs, 5, 9, 1, 0
+	shladd	 r34 = r35, 3, r34
+	adds	 r14 = 8, r36
+	mov	 r43 = r1
+	;;
+	adds	 r15 = -8, r34
+	ld8	 r39 = [r14]
+	.save ar.lc, r45
+	mov	 r45 = ar.lc
+	adds	 r14 = -16, r34
+	mov	 r40 = r0
+	adds	 r34 = -24, r34
+	;;
+	ld8	 r38 = [r15]
+	.save rp, r41
+	mov	 r41 = b0
+	.body
+	ld8	 r36 = [r36]
+	ld8	 r37 = [r14]
+	;;
+	cmp.gtu	 p6, p7 = r39, r38
+  (p6)	br.cond.dptk .L8
+	;;
+	cmp.leu	 p8, p9 = r36, r37
+	cmp.geu	 p6, p7 = r39, r38
+	;;
+  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+  (p7)	br.cond.dptk .L51
+.L8:
+	add	 r14 = r33, r35		// un + fn
+	mov	 r46 = r39		// argument to mpn_invert_limb
+	;;
+	adds	 r35 = -3, r14
+	;;
+	cmp.gt	 p12, p0 = r0, r35
+  (p12)	br.cond.dpnt L(end)
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+	setf.sig f11 = r8		// di (non-final)
+	setf.sig f34 = r39		// d1
+	setf.sig f33 = r36		// d0
+	mov	 r1 = r43
+	;;
+	mov	 r17 = 1
+	setf.sig f9 = r38		// n2
+	xma.l	 f6 = f11, f34, f0	// t0 = LO(di * d1)
+	;;
+	setf.sig f10 = r37		// n1
+	setf.sig f15 = r17		// 1
+	xma.hu	 f8 = f11, f33, f0	// s0 = HI(di * d0)
+	;;
+	getf.sig r17 = f6
+	getf.sig r16 = f8
+	mov	 ar.lc = r35
+	;;
+	sub	 r18 = r0, r39		// -d1
+	add	 r14 = r17, r36
+	;;
+	setf.sig f14 = r18		// -d1
+	cmp.leu	 p8, p9 = r17, r14
+	add	 r16 = r14, r16
+	;;
+  (p9)	adds	 r19 = 0, r0
+  (p8)	adds	 r19 = -1, r0
+	cmp.gtu	 p6, p7 = r14, r16
+	;;
+  (p6)	adds	 r19 = 1, r19
+	;;
+ifelse(1,1,`
+	cmp.gt	 p7, p6 = r0, r19
+	;;
+  (p6)	adds	 r8 = -1, r8		// di--
+  (p6)	sub	 r14 = r16, r39		// t0 -= d1
+  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+  (p6)	cmp.gt	 p9, p8 = 1, r19
+  (p7)	cmp.gt	 p9, p8 = 0, r19
+  (p6)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p8)	adds	 r8 = -1, r8		// di--
+  (p8)	sub	 r14 = r16, r39		// t0 -= d1
+  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+  (p8)	cmp.gt	 p7, p6 = 1, r19
+  (p9)	cmp.gt	 p7, p6 = 0, r19
+  (p8)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p6)	adds	 r8 = -1, r8		// di--
+  (p6)	sub	 r14 = r16, r39		// t0 -= d1
+  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+  (p6)	cmp.gt	 p9, p8 = 1, r19
+  (p7)	cmp.gt	 p9, p8 = 0, r19
+  (p6)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p8)	adds	 r8 = -1, r8		// di--
+  (p8)	sub	 r14 = r16, r39		// t0 -= d1
+  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+  (p8)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+',`
+	cmp.gt	 p8, p9 = r0, r19
+  (p8)	br.cond.dpnt .L46
+.L52:
+	cmp.leu	 p6, p7 = r39, r16
+	sub	 r14 = r16, r39
+	adds	 r8 = -1, r8
+	;;
+  (p7)	adds	 r19 = -1, r19
+	mov	 r16 = r14
+	;;
+  (p7)	cmp.gt	 p8, p9 = r0, r19
+  (p9)	br.cond.dptk .L52
+.L46:
+')
+	setf.sig f32 = r8		// di
+	shladd	 r32 = r35, 3, r32
+	;;
+
+	ALIGN(16)
+L(top):	nop 0
+	nop 0
+	cmp.gt	 p8, p9 = r33, r35
+	;;
+ (p8)	mov	 r37 = r0
+ (p9)	ld8	 r37 = [r34], -8
+	xma.hu	 f8 = f9, f32, f10	//				0,29
+	xma.l	 f12 = f9, f32, f10	//				0
+	;;
+	getf.sig r20 = f12		// q0				4
+	xma.l	 f13 = f15, f8, f9	// q += n2			4
+	sub	 r8 = -1, r36		// bitnot d0
+	;;
+	getf.sig r18 = f13		//				8
+	xma.l	 f7 = f14, f13, f10	//				8
+	xma.l	 f6 = f33, f13, f33	// t0 = LO(d0*q+d0)		8
+	xma.hu	 f9 = f33, f13, f33	// t1 = HI(d0*q+d0)		9
+	;;
+	getf.sig r38 = f7		// n1				12
+	getf.sig r16 = f6		//				13
+	getf.sig r19 = f9		//				14
+	;;
+	sub	 r38 = r38, r39		// n1 -= d1			17
+	;;
+	cmp.ne	 p9, p0 = r0, r0	// clear p9
+	cmp.leu	 p10, p11 = r16, r37	// cy for: n0 - t0		18
+	;;
+	sub	 r37 = r37, r16		// n0 -= t0			19
+  (p11)	sub	 r38 = r38, r19, 1	// n1 -= t1 - cy		19
+  (p10)	sub	 r38 = r38, r19		// n1 -= t1			19
+	;;
+	cmp.gtu	 p6, p7 = r20, r38	// n1 >= q0			20
+	;;
+  (p7)	cmp.ltu	 p9, p0 = r8, r37	//				21
+  (p6)	add	 r18 = 1, r18		//
+  (p7)	add	 r37 = r37, r36		//				21
+  (p7)	add	 r38 = r38, r39		//				21
+	;;
+	setf.sig f10 = r37		// n1				22
+  (p9)	add	 r38 = 1, r38		//				22
+	;;
+	setf.sig f9 = r38		// n2				23
+	cmp.gtu	 p6, p7 = r39, r38	//				23
+  (p7)	br.cond.spnt L(fix)
+L(bck):	st8	 [r32] = r18, -8
+	adds	 r35 = -1, r35
+	br.cloop.sptk.few L(top)
+	;;
+
+L(end):	add	r14 = 8, r34
+	add	r15 = 16, r34
+	mov	 b0 = r41
+	;;
+	st8	[r14] = r37
+	st8	[r15] = r38
+	mov	 ar.pfs = r42
+	mov	 r8 = r40
+	mov	 ar.lc = r45
+	br.ret.sptk.many b0
+	;;
+.L51:
+	.pred.rel "mutex", p8, p9
+	sub	 r37 = r37, r36
+  (p9)	sub	 r38 = r38, r39, 1
+  (p8)	sub	 r38 = r38, r39
+	adds	 r40 = 1, r0
+	br .L8
+	;;
+
+L(fix):	cmp.geu	 p6, p7 = r39, r38
+	cmp.leu	 p8, p9 = r36, r37
+	;;
+  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+  (p6)	br.cond.dptk L(bck)
+	sub	 r37 = r37, r36
+  (p9)	sub	 r38 = r38, r39, 1
+  (p8)	sub	 r38 = r38, r39
+	adds	 r18 = 1, r18
+	;;
+	setf.sig f9 = r38		// n2
+	setf.sig f10 = r37		// n1
+	br	 L(bck)
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/gcd_11.asm b/third_party/gmp/mpn/ia64/gcd_11.asm
new file mode 100644
index 0000000..6137227
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/gcd_11.asm
@@ -0,0 +1,110 @@
+dnl  Itanium-2 mpn_gcd_11
+
+dnl  Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C           cycles/bitpair (1x1 gcd)
+C Itanium:       ?
+C Itanium 2:     4.5
+
+
+ASM_START()
+
+C ctz_table[n] holds one less than the number of trailing zeros on n (the
+C loop always shifts by 1 first and the table supplies the rest), or
+C MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+	.rodata
+	ALIGN(m4_lshift(1,MAXSHIFT))	C align table to allow using dep
+ctz_table:
+	data1	MAXSHIFT
+forloop(i,1,MASK,
+`	data1	m4_count_trailing_zeros(i)-1
+')
+
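+C The algorithm is plain binary gcd on two odd operands, with the shift
+C counts taken from the table above; an illustrative C sketch:
+C
+C	while (x != y)
+C	  {
+C	    if (x < y)  { t = x; x = y; y = t; }
+C	    x -= y;			/* even, since x and y are odd */
+C	    x >>= ctz (x);
+C	  }
+C	return y;
+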
+define(`x0', r32)
+define(`y0', r33)
+
+PROLOGUE(mpn_gcd_11)
+	.prologue
+	.body
+		addl	r22 = @ltoff(ctz_table), r1
+	;;
+		ld8	r22 = [r22]
+		br	L(ent)
+	;;
+
+	ALIGN(32)
+L(top):
+	.pred.rel "mutex", p6,p7
+ {.mmi;	(p7)	mov	y0 = x0
+	(p6)	sub	x0 = x0, y0
+		dep	r21 = r19, r22, 0, MAXSHIFT	C concat(table,lowbits)
+}{.mmi;		and	r20 = MASK, r19
+	(p7)	mov	x0 = r19
+		and	r23 = 6, r19
+	;;
+}{.mmi;		cmp.eq	p6,p0 = 4, r23
+		cmp.eq	p7,p0 = 0, r23
+		shr.u	x0 = x0, 1		C shift-by-1, always OK
+}{.mmb;		ld1	r16 = [r21]
+		cmp.eq	p10,p0 = 0, r20
+	(p10)	br.spnt.few.clr	 L(count_better)
+	;;
+}
+L(bck):
+	.pred.rel "mutex", p6,p7
+ {.mii;		nop	0
+	(p6)	shr.u	x0 = x0, 1		C u was ...100 before shift-by-1 above
+	(p7)	shr.u	x0 = x0, r16		C u was ...000 before shift-by-1 above
+	;;
+}
+L(ent):
+ {.mmi;		sub	r19 = y0, x0
+		cmp.gtu	p6,p7 = x0, y0
+		cmp.ne	p8,p0 = x0, y0
+}{.mmb;		nop	0
+		nop	0
+	(p8)	br.sptk.few.clr L(top)
+}
+
+L(end):		mov	r8 = y0
+		br.ret.sptk.many b0
+
+L(count_better):
+		add	r20 = -1, x0
+	;;
+		andcm	r23 = r20, x0
+	;;
+		popcnt	r16 = r23
+		br	L(bck)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/gmp-mparam.h b/third_party/gmp/mpn/ia64/gmp-mparam.h
new file mode 100644
index 0000000..34d2bf3
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 900MHz Itanium2 (olympic.gmplib.org) */
+/* FFT tuning limit = 59,194,709 */
+/* Generated by tuneup.c, 2019-10-13, gcc 4.2 */
+
+#define MOD_1_1P_METHOD                      2  /* 17.40% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 1.35% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              10
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define DIV_1_VS_MUL_1_PERCENT             316
+
+#define MUL_TOOM22_THRESHOLD                47
+#define MUL_TOOM33_THRESHOLD                89
+#define MUL_TOOM44_THRESHOLD               220
+#define MUL_TOOM6H_THRESHOLD               327
+#define MUL_TOOM8H_THRESHOLD               454
+
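+/* Thresholds are operand sizes in limbs; for instance, mpn multiplication
+   selects its algorithm roughly as (simplified sketch):
+
+     if (n < MUL_TOOM22_THRESHOLD)       mpn_mul_basecase
+     else if (n < MUL_TOOM33_THRESHOLD)  mpn_toom22_mul
+     else ...                                                              */
+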
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     143
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     153
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226
+
+#define SQR_BASECASE_THRESHOLD              11
+#define SQR_TOOM2_THRESHOLD                 98
+#define SQR_TOOM3_THRESHOLD                135
+#define SQR_TOOM4_THRESHOLD                272
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                490
+
+#define MULMID_TOOM42_THRESHOLD             99
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               27
+
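+/* In the FFT tables below, each {n,k} entry means: from operand size n
+   (in limbs) upward, use a transform split into 2^k pieces.  */
+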
+#define MUL_FFT_MODF_THRESHOLD             840  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    840, 5}, {     30, 6}, {     16, 5}, {     33, 6}, \
+    {     17, 5}, {     36, 6}, {     35, 7}, {     18, 6}, \
+    {     37, 7}, {     19, 6}, {     42, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \
+    {     57, 9}, {     31, 8}, {     63, 9}, {     35, 8}, \
+    {     71, 9}, {     43,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     71,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     87,11}, {     47,10}, {    111,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
+    {     95,10}, {    191,11}, {    111,12}, {     63,11}, \
+    {    143,10}, {    287,11}, {    159,12}, {     95,11}, \
+    {    207,13}, {     63,12}, {    127,11}, {    271,12}, \
+    {    159,11}, {    335,10}, {    671,12}, {    191,10}, \
+    {    799,12}, {    223,13}, {    127,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,13}, {    191,12}, \
+    {    383,11}, {    799,10}, {   1599,12}, {    415,11}, \
+    {    863,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1119,12}, {    607,13}, {    319,12}, {    735,11}, \
+    {   1471,12}, {    863,13}, {    447,12}, {    927,11}, \
+    {   1855,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1055,11}, {   2111,12}, {   1119,13}, {    575,12}, \
+    {   1247,13}, {    639,12}, {   1311,13}, {    703,12}, \
+    {   1471,13}, {    831,12}, {   1727,13}, {    895,12}, \
+    {   1791,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2239,13}, {   1215,14}, {    639,13}, \
+    {   1471,14}, {    767,13}, {   1727,14}, {    895,13}, \
+    {   1855,12}, {   3711,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2111,12}, {   4223,13}, {   2175,14}, \
+    {   1151,13}, {   2495,14}, {   1279,13}, {   2623,14}, \
+    {   1407,15}, {    767,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,14}, {   2431,15}, {   1279,14}, {   2943,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 149
+#define MUL_FFT_THRESHOLD                 8576
+
+#define SQR_FFT_MODF_THRESHOLD             765  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    765, 5}, {     36, 6}, {     37, 7}, {     19, 6}, \
+    {     42, 7}, {     43, 8}, {     37, 9}, {     19, 8}, \
+    {     43, 9}, {     23, 8}, {     49, 9}, {     27, 8}, \
+    {     57, 9}, {     43,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     71,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     87,11}, {     47,10}, {    111,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    175,11}, \
+    {     95,10}, {    199,11}, {    111,12}, {     63,11}, \
+    {    159,12}, {     95,11}, {    191,10}, {    399,11}, \
+    {    207,13}, {     63,12}, {    127,10}, {    511, 9}, \
+    {   1023,10}, {    527,11}, {    271,12}, {    159,10}, \
+    {    703,12}, {    191,11}, {    399,10}, {    799,11}, \
+    {    431,12}, {    223,13}, {    127,12}, {    255,11}, \
+    {    527,10}, {   1055,11}, {    559,12}, {    287,11}, \
+    {    607,10}, {   1215,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    799,12}, {    415,11}, {    863,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1055,12}, {    543,11}, {   1119,12}, {    607,11}, \
+    {   1215,12}, {    735,13}, {    383,12}, {    799,11}, \
+    {   1599,12}, {    863,13}, {    447,12}, {    991,14}, \
+    {    255,13}, {    511,12}, {   1055,11}, {   2111,12}, \
+    {   1119,13}, {    575,12}, {   1215,13}, {    639,12}, \
+    {   1311,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1599,13}, {    831,12}, {   1727,13}, \
+    {    895,12}, {   1791,13}, {    959,12}, {   1919,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2239,13}, {   1151,12}, {   2303,13}, \
+    {   1215,14}, {    639,13}, {   1279,12}, {   2559,13}, \
+    {   1471,14}, {    767,13}, {   1727,14}, {    895,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2239,14}, \
+    {   1151,13}, {   2495,14}, {   1279,13}, {   2623,14}, \
+    {   1407,15}, {    767,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,14}, {   2431,15}, {   1279,14}, {   2943,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD                 6272
+
+#define MULLO_BASECASE_THRESHOLD            39
+#define MULLO_DC_THRESHOLD                   0  /* never mpn_mullo_basecase */
+#define MULLO_MUL_N_THRESHOLD            17050
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 134
+#define SQRLO_SQR_THRESHOLD              12322
+
+#define DC_DIV_QR_THRESHOLD                 73
+#define DC_DIVAPPR_Q_THRESHOLD             262
+#define DC_BDIV_QR_THRESHOLD               111
+#define DC_BDIV_Q_THRESHOLD                315
+
+#define INV_MULMOD_BNM1_THRESHOLD           92
+#define INV_NEWTON_THRESHOLD                15
+#define INV_APPR_THRESHOLD                  17
+
+#define BINV_NEWTON_THRESHOLD              280
+#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */
+#define REDC_2_TO_REDC_N_THRESHOLD         172
+
+#define MU_DIV_QR_THRESHOLD               1470
+#define MU_DIVAPPR_Q_THRESHOLD            1210
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1566
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define POWM_SEC_TABLE  3,22,139,1867
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        42
+#define SET_STR_DC_THRESHOLD              1339
+#define SET_STR_PRECOMPUTE_THRESHOLD      3934
+
+#define FAC_DSC_THRESHOLD                  866
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    3  /* 13.73% faster than 1 */
+#define HGCD_THRESHOLD                     129
+#define HGCD_APPR_THRESHOLD                202
+#define HGCD_REDUCE_THRESHOLD             4455
+#define GCD_DC_THRESHOLD                   658
+#define GCDEXT_DC_THRESHOLD                469
+#define JACOBI_BASE_METHOD                   2  /* 0.62% faster than 4 */
+
+/* Tuneup completed successfully, took 199042 seconds */
diff --git a/third_party/gmp/mpn/ia64/hamdist.asm b/third_party/gmp/mpn/ia64/hamdist.asm
new file mode 100644
index 0000000..477df4c
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/hamdist.asm
@@ -0,0 +1,365 @@
+dnl  IA-64 mpn_hamdist -- mpn Hamming distance.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       2
+C Itanium 2:     1
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`vp', `r33')
+define(`n', `r34')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
+
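+C The function computes s = sum (popcount (up[i] ^ vp[i])), unrolled four
+C ways below; as an illustrative C sketch:
+C
+C	for (s = 0, i = 0; i < n; i++)
+C	    s += popcount (up[i] ^ vp[i]);
+C	return s;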
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		up = 0, up		C			M I
+	addp4		vp = 0, vp		C			M I
+	zxt4		n = n			C			I
+	;;
+')
+
+ {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
+	ld8		r11 = [vp], 8		C load first vlimb	M01
+	mov.i		r2 = ar.lc		C save ar.lc		I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p0 = 4, n		C small count?		M I
+	add		n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+
+.Lb00:	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	xor		x0 = r10, r11		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+	mov		s = 0			C			M I
+  (p15)	br.cond.dptk	.grt4			C			B
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	;;
+	popcnt		c2 = x2			C			I0
+	br		.Lcj4			C			B
+
+.grt4:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.grt8			C			B
+
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	br		.Lcj8			C			B
+
+.grt8:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	br		.LL00			C			B
+
+
+.Lb01:	xor		x3 = r10, r11		C			M I
+	shr.u		n = n, 2		C			I0
+  (p15)	br.cond.dptk	.grt1			C			B
+	;;
+	popcnt		r8 = x3			C			I0
+	br.ret.sptk.many b0			C			B
+
+.grt1:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.grt5			C			B
+
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	br		.Lcj5			C			B
+
+.grt5:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.Loop			C			B
+	br		.Lend			C			B
+
+
+.Lb10:	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = r10, r11		C			M I
+  (p15)	br.cond.dptk	.grt2			C			B
+	;;
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c2 = x2			C			I0
+	;;
+	popcnt		c3 = x3			C			I0
+	;;
+	add		s = c2, c3		C			M I
+	br.ret.sptk.many b0			C			B
+
+.grt2:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.grt6			C			B
+
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	br		.Lcj6			C			B
+
+.grt6:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	xor		x1 = r10, r11		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+  (p15)	br.cond.dptk	.grt3			C			B
+	;;
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	;;
+	popcnt		c2 = x2			C			I0
+	;;
+	popcnt		c3 = x3			C			I0
+	;;
+	add		s = c1, c2		C			M I
+	;;
+	add		s = s, c3		C			M I
+	br.ret.sptk.many b0			C			B
+
+.grt3:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.grt7			C			B
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	br		.Lcj7			C			B
+
+.grt7:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	br		.LL11			C			B
+
+
+	ALIGN(32)
+.Loop:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	xor		x1 = u1, v1		C			M I
+	nop.b		1			C			-
+	;;
+.LL00:	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	xor		x2 = u2, v2		C			M I
+	nop.b		1			C			-
+	;;
+.LL11:	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	add		s = s, c1		C			M I
+	xor		x3 = u3, v3		C			M I
+	nop.b		1			C			-
+	;;
+.LL10:	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	add		s = s, c2		C			M I
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.Loop			C			B
+	;;
+
+.Lend:	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	xor		x1 = u1, v1		C			M I
+	;;
+.Lcj8:	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	xor		x2 = u2, v2		C			M I
+	;;
+.Lcj7:	popcnt		c0 = x0			C			I0
+	add		s = s, c1		C			M I
+	xor		x3 = u3, v3		C			M I
+	;;
+.Lcj6:	popcnt		c1 = x1			C			I0
+	add		s = s, c2		C			M I
+	;;
+.Lcj5:	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	;;
+.Lcj4:	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	;;
+	add		s = s, c1		C			M I
+	;;
+	add		s = s, c2		C			M I
+	;;
+	add		s = s, c3		C			M I
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/ia64-defs.m4 b/third_party/gmp/mpn/ia64/ia64-defs.m4
new file mode 100644
index 0000000..f71d280
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/ia64-defs.m4
@@ -0,0 +1,147 @@
+divert(-1)
+
+
+dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  ia64 assembler comments are C++ style "//" to the end of line.  gas
+dnl  also accepts "#" as a comment, if it's the first non-blank on a line.
+dnl
+dnl  BSD m4 can't handle a multi-character comment like "//" (see notes in
+dnl  mpn/asm-defs.m4).  For now the default "#" is left, but with care taken
+dnl  not to put any macros after "foo#" (since of course they won't expand).
+
+
+define(`ASM_START',
+m4_assert_numargs(0)
+`')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  32-byte alignment is used for the benefit of itanium-2, whose code
+dnl  fetcher takes two bundles per cycle only from a 32-byte aligned
+dnl  target; at an address that is 16 mod 32 it reads just one bundle in
+dnl  the first cycle.  This might not make any difference if the rotate
+dnl  buffers are full or there is other work holding up execution, but we
+dnl  use 32 bytes to give the best chance of peak throughput.
+dnl
+dnl  We can use .align here despite the gas bug noted in mpn/ia64/README,
+dnl  since we're not expecting to execute across a PROLOGUE(), at least not
+dnl  currently.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.text
+	.align	32
+	.global	$1#
+	.proc	$1#
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.endp	$1#
+')
+
+define(`DATASTART',
+	`dnl
+	DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+
+dnl  Usage: ALIGN(bytes)
+dnl
+dnl  Emit a ".align" directive.  "bytes" is eval()ed, so can be an
+dnl  expression.
+dnl
+dnl  This version overrides the definition in mpn/asm-defs.m4.  We suppress
+dnl  any .align if the gas byte-swapped-nops bug was detected by the
+dnl  configure macro GMP_ASM_IA64_ALIGN_OK.
+
+define(`ALIGN',
+m4_assert_numargs(1)
+m4_assert_defined(`IA64_ALIGN_OK')
+`ifelse(IA64_ALIGN_OK,no,,
+`.align	eval($1)')')
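+
+dnl  For example, ALIGN(32) emits ".align 32", or nothing when the bug was
+dnl  detected.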
+
+
+dnl  Usage: ASSERT([pr] [,code])
+dnl
+dnl  Require that the given predicate register is true after executing the
+dnl  test code.  For example,
+dnl
+dnl         ASSERT(p6,
+dnl         `       cmp.eq  p6,p0 = r3, r4')
+dnl
+dnl  If the predicate register argument is empty then nothing is tested, the
+dnl  code is just executed.  This can be used for setups required by later
+dnl  ASSERTs.  The code argument can be omitted to just test a predicate
+dnl  with no special setup code.
+dnl
+dnl  For convenience, stops are inserted before and after the code emitted.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`	;;
+ifelse(`$2',,,
+`$2
+	;;
+')
+ifelse(`$1',,,
+`($1)	br	.LASSERTok`'ASSERT_label_counter ;;
+	cmp.ne	p6,p6 = r0, r0	C illegal instruction
+	;;
+.LASSERTok`'ASSERT_label_counter:
+define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
+')
+')')
+define(`ASSERT_label_counter',1)
+
+define(`getfsig', `getf.sig')
+define(`setfsig', `setf.sig')
+define(`cmpeq',   `cmp.eq')
+define(`cmpne',   `cmp.ne')
+define(`cmpltu',  `cmp.ltu')
+define(`cmpleu',  `cmp.leu')
+define(`cmpgtu',  `cmp.gtu')
+define(`cmpgeu',  `cmp.geu')
+define(`cmple',   `cmp.le')
+define(`cmpgt',   `cmp.gt')
+define(`cmpeqor', `cmp.eq.or')
+define(`cmpequc', `cmp.eq.unc')
+
+divert
diff --git a/third_party/gmp/mpn/ia64/invert_limb.asm b/third_party/gmp/mpn/ia64/invert_limb.asm
new file mode 100644
index 0000000..5effdda
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/invert_limb.asm
@@ -0,0 +1,105 @@
+dnl  IA-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl  Copyright 2000, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C d = r32
+
+C           cycles
+C Itanium:    74
+C Itanium 2:  50+6
+
+C It should be possible to avoid the xmpy.hu and the following tests by
+C explicitly chopping in the last fma.  That would save about 10 cycles.
+
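+C In effect the routine returns the reciprocal word
+C
+C	di = floor ((B^2 - 1) / d) - B,   with B = 2^64 and d normalized,
+C
+C which for d not a power of 2 equals floor ((B^2 - B*d) / d); the float
+C code computes the latter, and the final integer step corrects a possible
+C off-by-one.
+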
+ASM_START()
+	.sdata
+	.align 16
+ifdef(`HAVE_DOUBLE_IEEE_LITTLE_ENDIAN',`
+.LC0:	data4 0x00000000, 0x80000000, 0x0000403f, 0x00000000	C 2^64
+.LC1:	data4 0x00000000, 0x80000000, 0x0000407f, 0x00000000	C 2^128
+
+',`ifdef(`HAVE_DOUBLE_IEEE_BIG_ENDIAN',`
+.LC0:	data4 0x403f8000, 0x00000000, 0x00000000, 0x00000000	C 2^64
+.LC1:	data4 0x407f8000, 0x00000000, 0x00000000, 0x00000000	C 2^128
+
+',`m4_error(`Oops, need to know float endianness
+')')')
+
+
+PROLOGUE(mpn_invert_limb)
+		C 00
+	addl		r14 = @gprel(.LC0), gp
+	addl		r15 = @gprel(.LC1), gp
+	setf.sig	f7 = r32
+	add		r9 = r32, r32		C check for d = 2^63
+	;;	C 01
+	ldfe		f10 = [r14]		C 2^64
+	ldfe		f8 = [r15]		C 2^128
+	cmp.eq		p6, p0 = 0, r9		C check for d = 2^63
+	mov		r8 = -1			C retval for 2^63
+   (p6)	br.ret.spnt.many b0
+	;;	C 07
+	fmpy.s1		f11 = f7, f10		C f11 = d * 2^64
+	fnma.s1		f6 = f7, f10, f8	C f6 = 2^128 - d * 2^64
+	;;	C 11
+	frcpa.s1	f8, p6 = f6, f7
+	;;	C 15
+   (p6)	fnma.s1		f9 = f7, f8, f1
+   (p6)	fmpy.s1		f10 = f6, f8
+	;;	C 19
+   (p6)	fmpy.s1		f11 = f9, f9
+   (p6)	fma.s1		f10 = f9, f10, f10
+	;;	C 23
+   (p6)	fma.s1		f8 = f9, f8, f8
+   (p6)	fma.s1		f9 = f11, f10, f10
+	;;	C 27
+   (p6)	fma.s1		f8 = f11, f8, f8
+   (p6)	fnma.s1		f10 = f7, f9, f6
+	;;	C 31
+   (p6)	fma.s1		f8 = f10, f8, f9
+	;;	C 35
+	fcvt.fxu.trunc.s1 f8 = f8
+	;;	C 39
+	getf.sig	r8 = f8
+	xmpy.hu		f10 = f8, f7		C di * d
+	;;	C 43
+	getf.sig	r14 = f10
+	andcm		r9 = -1, r32		C one's complement
+	;;	C 48
+	cmp.ltu		p6, p0 = r9, r14	C got overflow?
+	;;	C 49
+   (p6)	add		r8 = -1, r8		C adjust di down
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/logops_n.asm b/third_party/gmp/mpn/ia64/logops_n.asm
new file mode 100644
index 0000000..e4a2f61
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/logops_n.asm
@@ -0,0 +1,292 @@
+dnl  IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2
+C Itanium 2:    1
+
+C TODO
+C  * Use rp,rpx scheme of aors_n.asm to allow parallel stores (useful in
+C    wind-down code).
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n', `r35')
+
+ifdef(`OPERATION_and_n',
+`	define(`func',`mpn_and_n')
+	define(`logop',		`and	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_andn_n',
+`	define(`func',`mpn_andn_n')
+	define(`logop',		`andcm	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_nand_n',
+`	define(`func',`mpn_nand_n')
+	define(`logop',		`and	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_ior_n',
+`	define(`func',`mpn_ior_n')
+	define(`logop',		`or	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_iorn_n',
+`	define(`func',`mpn_iorn_n')
+	define(`logop',		`andcm	$1 = $3, $2')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_nior_n',
+`	define(`func',`mpn_nior_n')
+	define(`logop',		`or	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_xor_n',
+`	define(`func',`mpn_xor_n')
+	define(`logop',		`xor	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_xnor_n',
+`	define(`func',`mpn_xnor_n')
+	define(`logop',		`xor	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+
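+C In C terms (an illustration, not part of the original source), each
+C entry point is one parallel logical op from logop, optionally followed
+C by the complement in notormov, e.g.
+C
+C	andn:  rp[i] = up[i] & ~vp[i]			(andcm)
+C	iorn:  rp[i] = up[i] | ~vp[i]  ==  ~(vp[i] & ~up[i])
+C	nior:  rp[i] = ~(up[i] | vp[i])
+C	xnor:  rp[i] = ~(up[i] ^ vp[i])
+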
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp			C			M I
+	addp4	up = 0, up			C			M I
+	addp4	vp = 0, vp			C			M I
+	nop.m		0
+	nop.m		0
+	zxt4	n = n				C			I
+	;;
+')
+{.mmi
+	ld8		r10 = [up], 8		C			M
+	ld8		r11 = [vp], 8		C			M
+	mov.i		r2 = ar.lc		C			I0
+}
+{.mmi
+	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p14 = 4, n		C			M I
+	shr.u		n = n, 2		C			I0
+	;;
+}
+{.mmi
+	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}
+{.bbb
+   (p6)	br.dptk		.Lb01			C			B
+   (p7)	br.dptk		.Lb10			C			B
+   (p8)	br.dptk		.Lb11			C			B
+}
+
+.Lb00:	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	add		n = -2, n		C			M I
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+  (p15)	br.cond.dpnt	.grt4			C			B
+
+	logop(		r14, r10, r11)		C			M I
+	;;
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	br		.Lcj4			C			B
+
+.grt4:	logop(		r14, r10, r11)		C			M I
+	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	logop(		r15, r17, r21)		C			M I
+	ld8		r17 = [up], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	notormov(	r8, r14)		C			M I
+	ld8		r21 = [vp], 8		C			M
+	br		.LL00			C			B
+
+.Lb01:	add		n = -1, n		C			M I
+	logop(		r15, r10, r11)		C			M I
+  (p15)	br.cond.dpnt	.grt1			C			B
+	;;
+
+	notormov(	r9, r15)		C			M I
+	br		.Lcj1			C			B
+
+.grt1:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.grt5			C			B
+	;;
+
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	br		.Lcj5			C			B
+
+.grt5:	logop(		r14, r16, r20)		C			M I
+	ld8		r16 = [up], 8		C			M
+	notormov(	r9, r15)		C			M I
+	ld8		r20 = [vp], 8		C			M
+	br		.LL01			C			B
+
+.Lb10:	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+  (p15)	br.cond.dpnt	.grt2			C			B
+
+	logop(		r14, r10, r11)		C			M I
+	;;
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	br		.Lcj2			C			B
+
+.grt2:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	add		n = -1, n		C			M I
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	logop(		r14, r10, r11)		C			M I
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	logop(		r15, r19, r23)		C			M I
+	ld8		r19 = [up], 8		C			M
+	notormov(	r8, r14)		C			M I
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.Loop			C			B
+	br		.Lcj6			C			B
+
+.Lb11:	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	add		n = -1, n		C			M I
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	logop(		r15, r10, r11)		C			M I
+  (p15)	br.cond.dpnt	.grt3			C			B
+	;;
+
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	br		.Lcj3			C			B
+
+.grt3:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	logop(		r14, r18, r22)		C			M I
+	ld8		r18 = [up], 8		C			M
+	notormov(	r9, r15)		C			M I
+	ld8		r22 = [vp], 8		C			M
+	br		.LL11			C			B
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL01:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL00:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL11:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.Loop	;;		C			B
+C *** MAIN LOOP END ***
+
+.Lcj6:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj5:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	;;
+.Lcj4:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj3:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	;;
+.Lcj2:	st8		[rp] = r8, 8		C			M
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj1:	st8		[rp] = r9, 8		C			M
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/lorrshift.asm b/third_party/gmp/mpn/ia64/lorrshift.asm
new file mode 100644
index 0000000..694aaf0
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/lorrshift.asm
@@ -0,0 +1,358 @@
+dnl  IA-64 mpn_lshift/mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2
+C Itanium 2:    1
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C The ld8 scheduling depth should probably be decreased to make the
+C function smaller.  Good lfetch placement will make sure we never stall
+C anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
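+C
+C For reference, a plain-C model of mpn_lshift (a sketch, not from the
+C original source; mpn_rshift is the mirror image, and 1 <= cnt <= 63):
+C
+C	mp_limb_t
+C	mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
+C	{
+C	  mp_limb_t ret = up[n - 1] >> (64 - cnt);	/* bits shifted out */
+C	  for (mp_size_t i = n - 1; i > 0; i--)
+C	    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
+C	  rp[0] = up[0] << cnt;
+C	  return ret;
+C	}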
+
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+ifdef(`OPERATION_lshift',`
+	define(`FSH',`shl')
+	define(`BSH',`shr.u')
+	define(`UPD',`-8')
+	define(`POFF',`-512')
+	define(`PUPD',`-32')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`FSH',`shr.u')
+	define(`BSH',`shl')
+	define(`UPD',`8')
+	define(`POFF',`512')
+	define(`PUPD',`32')
+	define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp		C			M I
+	addp4	up = 0, up		C		M I
+	sxt4	n = n			C		M I
+	nop.m		0
+	nop.m		0
+	zxt4	cnt = cnt		C		I
+	;;
+')
+
+ {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
+	and	r14 = 3, n		C		M I
+	mov.i	r2 = ar.lc		C		I0
+}{.mmi;	add	r15 = -1, n		C		M I
+	sub	tnc = 64, cnt		C		M I
+	add	r16 = -5, n
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
+	cmp.eq	p7, p0 = 2, r14		C		M I
+	shr.u	n = r16, 2		C		I0
+}{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
+ifdef(`OPERATION_lshift',
+`	shladd	up = r15, 3, up		C		M I
+	shladd	rp = r15, 3, rp')	C		M I
+	;;
+}{.mmi;	add	r11 = POFF, up		C		M I
+	ld8	r10 = [up], UPD		C		M01
+	mov.i	ar.lc = n		C		I0
+}{.bbb;
+   (p6)	br.dptk	.Lb01
+   (p7)	br.dptk	.Lb10
+   (p8)	br.dptk	.Lb11
+	;; }
+
+.Lb00:	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	FSH	r24 = r10, cnt
+	BSH	r25 = r19, tnc
+  (p14)	br.cond.dptk	.grt4
+	;;
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	BSH	r23 = r10, tnc
+	br	.Lr4
+
+.grt4:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+	or	r14 = r25, r24
+	ld8	r17 = [up], UPD
+	br.cloop.dpnt	.Ltop
+	br	.Lbot
+
+.Lb01:
+  (p15)	BSH	r8 = r10, tnc		C function return value	I
+  (p15)	FSH	r22 = r10, cnt		C		I
+  (p15)	br.cond.dptk	.Lr1		C return	B
+
+.grt1:	ld8	r18 = [up], UPD
+	;;
+	ld8	r19 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r10, cnt
+	BSH	r23 = r18, tnc
+	;;
+	ld8	r17 = [up], UPD
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	br.cloop.dpnt	.grt5
+	;;
+	or	r15 = r23, r22
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	br	.Lr5
+
+.grt5:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r15 = r23, r22
+	ld8	r16 = [up], UPD
+	br	.LL01
+
+
+.Lb10:	ld8	r17 = [up], UPD
+  (p14)	br.cond.dptk	.grt2
+
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	FSH	r20 = r10, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r21, r20
+	FSH	r22 = r17, cnt
+	br	.Lr2			C return
+
+.grt2:	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r10, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+ {.mmi;	ld8	r17 = [up], UPD
+	or	r14 = r21, r20
+	FSH	r24 = r18, cnt
+}{.mib;	nop	0
+	BSH	r25 = r19, tnc
+	br.cloop.dpnt	.grt6
+	;; }
+
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	br	.Lr6
+
+.grt6:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	br	.LL10
+
+
+.Lb11:	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+  (p14)	br.cond.dptk	.grt3
+	;;
+
+	FSH	r26 = r10, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r15 = r27, r26
+	FSH	r22 = r17, cnt
+	br	.Lr3			C return
+
+.grt3:	ld8	r18 = [up], UPD
+	FSH	r26 = r10, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+	ld8	r17 = [up], UPD
+	br.cloop.dpnt	.grt7
+
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	br	.Lr7
+
+.grt7:	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	ld8	r18 = [up], UPD
+	br	.LL11
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Ltop:
+ {.mmi;	st8	[rp] = r14, UPD		C M2
+	or	r15 = r27, r26		C M3
+	FSH	r24 = r18, cnt		C I0
+}{.mmi;	ld8	r18 = [up], UPD		C M1
+	lfetch	[r11], PUPD
+	BSH	r25 = r19, tnc		C I1
+	;; }
+.LL11:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop.m	0
+	BSH	r27 = r16, tnc
+	;; }
+.LL10:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop.m	0
+	BSH	r21 = r17, tnc
+	;; }
+.LL01:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	.Ltop
+	;; }
+C *** MAIN LOOP END ***
+
+.Lbot:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	nop	0
+	BSH	r25 = r19, tnc
+	nop	0
+	;; }
+.Lr7:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	nop	0
+	BSH	r27 = r16, tnc
+	nop	0
+	;; }
+.Lr6:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	nop	0
+	BSH	r21 = r17, tnc
+	nop	0
+	;; }
+.Lr5:	st8	[rp] = r15, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	;;
+.Lr4:	st8	[rp] = r14, UPD
+	or	r15 = r27, r26
+	;;
+.Lr3:	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	;;
+.Lr2:	st8	[rp] = r14, UPD
+	;;
+.Lr1:	st8	[rp] = r22, UPD		C		M23
+	mov	ar.lc = r2		C		I0
+	br.ret.sptk.many b0		C		B
+EPILOGUE(func)
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/lshiftc.asm b/third_party/gmp/mpn/ia64/lshiftc.asm
new file mode 100644
index 0000000..e8cec87
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/lshiftc.asm
@@ -0,0 +1,463 @@
+dnl  IA-64 mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.25
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C The ld8 scheduling depth should probably be decreased to make the
+C function smaller.  Good lfetch placement will make sure we never stall
+C anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
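+C
+C For reference (not from the original source): mpn_lshiftc is mpn_lshift
+C with each stored limb complemented, rp[i] = ~(shifted limb); the return
+C value, the bits shifted out of the high end, is not complemented.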
+
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+define(`FSH',`shl')
+define(`BSH',`shr.u')
+define(`UPD',`-8')
+define(`POFF',`-512')
+define(`PUPD',`-32')
+define(`func',`mpn_lshiftc')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp		C				M I
+	addp4	up = 0, up		C				M I
+	sxt4	n = n			C				M I
+	nop.m		0
+	nop.m		0
+	zxt4	cnt = cnt		C				I
+	;;
+')
+
+ {.mmi;	nop	0			C				M I
+	and	r14 = 3, n		C				M I
+	mov.i	r2 = ar.lc		C				I0
+}{.mmi;	add	r15 = -1, n		C				M I
+	sub	tnc = 64, cnt		C				M I
+	nop	0
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C				M I
+	cmp.eq	p7, p0 = 2, r14		C				M I
+	shr.u	n = r15, 2		C				I0
+}{.mmi;	cmp.eq	p8, p0 = 3, r14		C				M I
+	shladd	up = r15, 3, up		C				M I
+	shladd	rp = r15, 3, rp		C				M I
+	;;
+}{.mmi;	add	r11 = POFF, up		C				M I
+	ld8	r10 = [up], UPD		C				M01
+	mov.i	ar.lc = n		C				I0
+}{.bbb;
+   (p6)	br.dptk	.Lb01
+   (p7)	br.dptk	.Lb10
+   (p8)	br.dptk	.Lb11
+	;; }
+
+.Lb00:
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc
+	br.cloop.dptk	L(gt4)
+	;;
+	FSH	r24 = r10, cnt
+	BSH	r25 = r19, tnc
+	;;
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	;;
+	or	r15 = r27, r26
+	sub	r31 = -1, r14
+	br	.Lr4
+
+L(gt4):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r10, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;; }
+ {.mmi;	nop	0
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt8)
+	;; }
+ {.mmi;	nop	0
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	br	.Lr8 }
+
+L(gt8):
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	ld8	r18 = [up], UPD
+	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	br	.LL00
+
+.Lb01:
+	br.cloop.dptk	L(gt1)
+	;;
+	BSH	r8 = r10, tnc
+	FSH	r22 = r10, cnt
+	;;
+	sub	r31 = -1, r22
+	br	.Lr1
+	;;
+L(gt1):
+	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r22 = r10, cnt
+	;;
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt5)
+	;;
+	nop	0
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+	nop	0
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	sub	r31 = -1, r15
+	br	.Lr5
+
+L(gt5):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	nop	0
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;; }
+ {.mmi;	or	r14 = r25, r24
+	sub	r31 = -1, r15
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br	L(end)
+	;; }
+
+.Lb10:
+	ld8	r17 = [up], UPD
+	br.cloop.dptk	L(gt2)
+	;;
+	BSH	r8 = r10, tnc
+	FSH	r20 = r10, cnt
+	;;
+	BSH	r21 = r17, tnc
+	FSH	r22 = r17, cnt
+	;;
+	or	r14 = r21, r20
+	;;
+	sub	r31 = -1, r14
+	br	.Lr2
+	;;
+L(gt2):
+	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r20 = r10, cnt
+	;;
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	BSH	r21 = r17, tnc
+	FSH	r22 = r17, cnt
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt6)
+	;;
+	nop	0
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+ {.mmi;	nop	0
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	br	.Lr6
+	;; }
+L(gt6):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	nop	0
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	or	r15 = r23, r22
+	sub	r31 = -1, r14
+	FSH	r20 = r16, cnt
+}{.mib;	ld8	r16 = [up], UPD
+	BSH	r21 = r17, tnc
+	br	.LL10
+}
+
+.Lb11:
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r26 = r10, cnt
+	br.cloop.dptk	L(gt3)
+	;;
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	FSH	r22 = r17, cnt
+	;;
+	or	r15 = r27, r26
+	;;
+	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	br	.Lr3
+	;;
+L(gt3):
+	ld8	r18 = [up], UPD
+	;;
+	ld8	r19 = [up], UPD
+	BSH	r27 = r16, tnc
+	;;
+ {.mmi;	nop	0
+	nop	0
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;;
+}{.mmi;	nop	0
+	nop	0
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt7)
+	;; }
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+ {.mmi;	nop	0
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	br	.Lr7
+}
+L(gt7):
+ {.mmi;	nop	0
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	FSH	r26 = r19, cnt
+}{.mib;	ld8	r19 = [up], UPD
+	BSH	r27 = r16, tnc
+	br	.LL11
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):
+.LL01:
+ {.mmi;	st8	[rp] = r31, UPD		C M2
+	or	r15 = r27, r26		C M3
+	FSH	r24 = r18, cnt		C I0
+}{.mmi;	ld8	r18 = [up], UPD		C M0
+	sub	r31 = -1, r14		C M1
+	BSH	r25 = r19, tnc		C I1
+	;; }
+.LL00:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	;; }
+.LL11:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	;; }
+.LL10:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mmi;	ld8	r17 = [up], UPD
+	sub	r31 = -1, r15
+	BSH	r23 = r18, tnc
+	;; }
+L(end):	lfetch		[r11], PUPD
+	br.cloop.dptk	L(top)
+C *** MAIN LOOP END ***
+
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	nop	0
+	;; }
+.Lr8:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	nop	0
+	;; }
+.Lr7:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	nop	0
+	;; }
+.Lr6:	st8	[rp] = r31, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	sub	r31 = -1, r15
+	;;
+.Lr5:	st8	[rp] = r31, UPD
+	or	r15 = r27, r26
+	sub	r31 = -1, r14
+	;;
+.Lr4:	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	;;
+.Lr3:	st8	[rp] = r31, UPD
+	sub	r31 = -1, r14
+	;;
+.Lr2:	st8	[rp] = r31, UPD
+	sub	r31 = -1, r22
+	;;
+.Lr1:	st8	[rp] = r31, UPD		C				M23
+	mov	ar.lc = r2		C				I0
+	br.ret.sptk.many b0		C				B
+EPILOGUE(func)
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mod_34lsub1.asm b/third_party/gmp/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl  IA-64 mpn_mod_34lsub1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1
+
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n',  `r33')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
+
+C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
+C with a more sophisticated implementation.  If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there.  That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C	add		n = -3, n
+C	cmp.le		p9, p0 = 3, n
+C  (p9)	br.cond		.Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
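+C
+C For reference (not from the original source): the function returns a
+C value congruent to {up,n} modulo 2^48 - 1.  Since 2^64 == 2^16
+C (mod 2^48 - 1), limb i carries weight 2^(16*(i mod 3)), which is why
+C the loop below keeps three accumulator/borrow pairs (a0-a2, c0-c2) and
+C the wind-down code folds everything into 48-bit groups.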
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+ifelse(0,1,`
+	movl		r14 = 0xAAAAAAAAAAAAAAAB
+	;;
+	setf.sig	f6 = r14
+	setf.sig	f7 = r33
+	;;
+	xmpy.hu		f6 = f6, f7
+	;;
+	getf.sig	r8 = f6
+	;;
+	shr.u		r8 = r8, 1		C Loop count
+	;;
+	mov.i		ar.lc = r8
+')
+
+	ld8	u0 = [up], 8
+	cmp.ne	p9, p0 = 1, n
+  (p9)	br	L(gt1)
+	;;
+	shr.u	r8 = u0, 48
+	dep.z	r27 = u0, 0, 48
+	;;
+	add	r8 = r8, r27
+	br.ret.sptk.many b0
+
+
+L(gt1):
+ {.mmi;	nop.m	0
+	mov	a0 = 0
+	add	n = -2, n
+}{.mmi;	mov	c0 = 0
+	mov	c1 = 0
+	mov	c2 = 0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+	mov	a1 = 0
+	cmp.ltu	p6, p0 = r0, r0		C clear p6
+}{.mmb;	cmp.gt	p9, p0 = 3, n
+	mov	a2 = 0
+  (p9)	br.cond.dptk	L(end)
+	;;
+}
+	ALIGN(32)
+L(top):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	add	n = -3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+}{.mmb;	sub	a1 = a1, u1
+	cmp.le	p9, p0 = 3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+}{.mmb;	sub	a2 = a2, u2
+	nop.m	0
+dnl	br.cloop.dptk	L(top)
+  (p9)	br.cond.dptk	L(top)
+	;;
+}
+L(end):
+	cmp.eq	p10, p0 = 0, n
+	cmp.eq	p11, p0 = 1, n
+  (p10)	br	L(0)
+
+L(2):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	nop.m	0
+  (p11)	br	L(1)
+	;;
+}	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	br	L(com)
+
+
+L(1):
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	br	L(com)
+
+
+L(0):
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+
+L(com):
+C |     a2    |     a1    |     a0    |
+C |        |        |        |        |
+	shr.u	r24 = a0, 48		C 16 bits
+	shr.u	r25 = a1, 32		C 32 bits
+	shr.u	r26 = a2, 16		C 48 bits
+	;;
+	shr.u	r10 = c0, 48		C 16 bits, always zero
+	shr.u	r11 = c1, 32		C 32 bits
+	shr.u	r30 = c2, 16		C 48 bits
+	;;
+	dep.z	r27 = a0,  0, 48	C 48 bits
+	dep.z	r28 = a1, 16, 32	C 48 bits
+	dep.z	r29 = a2, 32, 16	C 48 bits
+	dep.z	r31 = c0,  0, 48	C 48 bits
+	dep.z	r14 = c1, 16, 32	C 48 bits
+	dep.z	r15 = c2, 32, 16	C 48 bits
+	;;
+ {.mmi;	add	r24 = r24, r25
+	add	r26 = r26, r27
+	add	r28 = r28, r29
+}{.mmi;	add	r10 = r10, r11
+	add	r30 = r30, r31
+	add	r14 = r14, r15
+	;;
+}
+	movl	r8 = 0xffffffffffff0
+	add	r24 = r24, r26
+	add	r10 = r10, r30
+	;;
+	add	r24 = r24, r28
+	add	r10 = r10, r14
+	;;
+	sub	r8 = r8, r24
+	;;
+	add	r8 = r8, r10
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mode1o.asm b/third_party/gmp/mpn/ia64/mode1o.asm
new file mode 100644
index 0000000..14d5e81
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mode1o.asm
@@ -0,0 +1,342 @@
+dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
+
+dnl  Contributed to the GNU project by Kevin Ryde.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C            cycles/limb
+C Itanium:      15
+C Itanium 2:     8
+
+
+dnl  Usage: ABI32(`code')
+dnl
+dnl  Emit the given code only under HAVE_ABI_32.
+dnl
+define(ABI32,
+m4_assert_onearg()
+`ifdef(`HAVE_ABI_32',`$1')')
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C The modexact algorithm is usually conceived as a dependent chain
+C
+C	l = src[i] - c
+C	q = low(l * inverse)
+C	c = high(q*divisor) + (src[i]<c)
+C
+C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
+C separately (off the dependent chain) and using
+C
+C	q = low(c * inverse + si)
+C	c = high(q*divisor + c)
+C
+C This means the dependent chain is simply xma.l followed by xma.hu, for a
+C total of 8 cycles/limb on Itanium 2.
+C
+C The reason xma.hu works for the new c is that the low of q*divisor is
+C src[i]-c (being the whole purpose of the q generated, and it can be
+C verified algebraically).  If there was an underflow from src[i]-c, then
+C there will be an overflow from (src[i]-c)+c, thereby adding 1 to the new c
+C the same as the borrow bit (src[i]<c) gives in the first style shown.
+C
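+C In plain C (an illustration only, not part of the original source, with
+C u128 standing for an unsigned 128-bit type):
+C
+C	si = up[i] * inverse;                      /* off the chain */
+C	q  = c * inverse + si;                     /* the xma.l     */
+C	c  = (mp_limb_t) (((u128) q * divisor + c) >> 64);  /* the xma.hu */
+C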
+C Incidentally, fcmp is not an option for treating src[i]-c, since it
+C apparently traps to the kernel for unnormalized operands like those used
+C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
+C cycles.
+C
+C
+C First Limb:
+C
+C The first limb uses q = (src[0]-c) * inverse shown in the first style.
+C This lets us get the first q as soon as the inverse is ready, without
+C going through si=s*inverse.  Basically at the start we have c and can use
+C it while waiting for the inverse, whereas for the second and subsequent
+C limbs it's the other way around, ie. we have the inverse and are waiting
+C for c.
+C
+C At .Lentry the first two instructions in the loop have been done
+C already: the load of f11=src[1] at the start (predicated on size>=2),
+C and the calculation of q by the different initial scheme.
+C
+C
+C Entry Sequence:
+C
+C In the entry sequence, the critical path is the calculation of the
+C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
+C established nice and early so the br.cloop's should predict perfectly.
+C And the load for the low limbs src[0] and src[1] can be initiated long
+C ahead of where they're needed.
+C
+C
+C Inverse Calculation:
+C
+C The initial 8-bit inverse is calculated using a table lookup.  If it hits
+C L1 (which is likely if we're called several times) then it should take a
+C total of 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
+C the best approach, on balance.  It could be done bitwise, but that would
+C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
+C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
+C but that would be about 11 cycles.
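+C
+C (The widening from 8 to 64 bits, the xma sequence further down, is the
+C usual Newton iteration for a modular inverse: if i*d == 1 (mod 2^k)
+C then i' = i*i*(-d) + 2*i satisfies i'*d == 1 (mod 2^(2k)), so each of
+C the three steps doubles the number of correct low bits,
+C 8 -> 16 -> 32 -> 64.)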
+C
+C The table is not the same as binvert_limb_table, instead it's 256 bytes,
+C designed to be indexed by the low byte of the divisor.  The divisor is
+C always odd, so the relevant data is every second byte in the table.  The
+C padding lets us use zxt1 instead of extr.u; the latter would cost an extra
+C cycle because it must go down I0, and we're using the first I0 slot to get
+C ip.  The extra 128 bytes of padding should be insignificant compared to
+C typical ia64 code bloat.
+C
+C Having the table in .text allows us to use IP-relative addressing,
+C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
+C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
+C
+C
+C Load Scheduling:
+C
+C In the main loop, the data loads are scheduled for an L2 hit, which means
+C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
+C any case that scheduling is achieved simply by doing the load (and xmpy.l
+C for "si") in the immediately preceding iteration.
+C
+C The main loop requires size >= 2, and we handle size==1 by an initial
+C br.cloop to enter the loop only if size>1.  Since ar.lc is established
+C early, this should predict perfectly.
+C
+C
+C Not done:
+C
+C Consideration was given to using a plain "(src[0]-c) % divisor" for
+C size==1, but cycle counting suggests about 50 for the sort of approach
+C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
+C L1 hits for their respective fetching.)
+C
+C Consideration was given to a test for high<divisor and replacing the last
+C loop iteration with c-=src[size-1] followed by c+=d on underflow, instead.
+C Branching on high<divisor wouldn't be good since a mispredict would cost
+C more than the loop iteration saved, and the condition is of course data
+C dependent.  So the theory would be to shorten the loop count if
+C high<divisor, and predicate extra operations at the end.  That would mean
+C a gain of 6 when high<divisor, or a cost of 2 if not.
+C
+C Whether such a tradeoff is a win on average depends on assumptions about
+C how many bits in the high and the divisor.  If both are uniformly
+C distributed then high<divisor holds about 50% of the time.  But smallish
+C divisors (less chance of high<divisor) might be more likely from
+C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
+C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
+C for the moment, it's felt the gain is not really enough to be worth the
+C trouble.
+C
+C
+C Enhancement:
+C
+C Process two source limbs per iteration using a two-limb inverse and a
+C sequence like
+C
+C	ql  = low (c * il + sil)	quotient low limb
+C	qlc = high(c * il + sil)
+C	qh1 = low (c * ih + sih)	quotient high, partial
+C
+C	cl = high (ql * d + c)		carry out of low
+C	qh = low (qlc * 1 + qh1)	quotient high limb
+C
+C	new c = high (qh * d + cl)	carry out of high
+C
+C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
+C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
+C chain with 4 multiplies.  The bigger inverse would take extra time to
+C calculate, but a one limb iteration to handle an odd size could be done as
+C soon as 64 bits of inverse were ready.
+C
+C Perhaps this could even extend to a 3 limb inverse, which might promise 17
+C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
+C
+
+ASM_START()
+	.explicit
+
+	.text
+	.align	32
+.Ltable:
+data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_modexact_1c_odd)
+
+	C r32	src
+	C r33	size
+	C r34	divisor
+	C r35	carry
+
+	.prologue
+.Lhere:
+{ .mmi;	add	r33 = -1, r33		C M0  size-1
+	mov	r14 = 2			C M1  2
+	mov	r15 = ip		C I0  .Lhere
+}{.mmi;	setf.sig f6 = r34		C M2  divisor
+	setf.sig f9 = r35		C M3  carry
+	zxt1	r3 = r34		C I1  divisor low byte
+}	;;
+
+{ .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
+	sub	r16 = 0, r34		C M1  -divisor
+	.save	ar.lc, r2
+	mov	r2 = ar.lc		C I0
+}{.mmi;	.body
+	setf.sig f13 = r14		C M2  2 in significand
+	mov	r17 = -1		C M3  -1
+ABI32(`	zxt4	r33 = r33')		C I1  size extend
+}	;;
+
+{ .mmi;	add	r3 = r3, r15		C M0  table entry address
+ABI32(` addp4	r32 = 0, r32')		C M1  src extend
+	mov	ar.lc = r33		C I0  size-1 loop count
+}{.mmi;	setf.sig f12 = r16		C M2  -divisor
+	setf.sig f8 = r17		C M3  -1
+}	;;
+
+{ .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
+	ldf8	f10 = [r32], 8		C M1  src[0]
+	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
+}	;;
+
+	C Wait for table load.
+	C Hope for an L1 hit of 1 cycle to the ALU, but could be more.
+	setf.sig f7 = r3		C M2  inverse, 8 bits
+(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
+	;;
+
+	C 5 cycles
+
+	C f6	divisor
+	C f7	inverse, being calculated
+	C f8	-1, will be -inverse
+	C f9	carry
+	C f10	src[0]
+	C f11	src[1]
+	C f12	-divisor
+	C f13	2
+	C f14	scratch
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
+	;;
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
+	;;
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
+	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
+	;;
+ASSERT(p6, `
+	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
+	getf.sig r31 = f15 ;;
+	cmp.eq	p6,p0 = 1, r31	C should == 1
+')
+
+	xmpy.l	f10 = f10, f7		C q = sc * inverse
+	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
+	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
+	;;
+
+	C size==1, finish up now
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	mov	ar.lc = r2		C I0
+	;;
+	getf.sig r8 = f9		C M2  return c
+	br.ret.sptk.many b0
+
+
+
+.Ltop:
+	C r2	saved ar.lc
+	C f6	divisor
+	C f7	inverse
+	C f8	-inverse
+	C f9	carry
+	C f10	src[i] * inverse
+	C f11	scratch src[i+1]
+
+	add	r16 = 160, r32
+	ldf8	f11 = [r32], 8		C src[i+1]
+	;;
+	C 2 cycles
+
+	lfetch	[r16]
+	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
+	;;
+	C 3 cycles
+
+.Lentry:
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
+	br.cloop.sptk.few.clr .Ltop
+	;;
+
+
+
+	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
+	mov	ar.lc = r2		C I0
+	;;
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	;;
+	getf.sig r8 = f9		C M2  return c
+	br.ret.sptk.many b0
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/mul_1.asm b/third_party/gmp/mpn/ia64/mul_1.asm
new file mode 100644
index 0000000..21bf6d0
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mul_1.asm
@@ -0,0 +1,584 @@
+dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
+dnl  store the result in a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4.0
+C Itanium 2:  2.0
+
+C TODO
+C  * Further optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Use 1 c/l carry propagation scheme in wind-down code.
+C  * Use extra pointer register for `up' to speed up feed-in loads.
+C  * Work out final differences with addmul_1.asm.
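+C
+C For reference, a plain-C model of the function (a sketch, not from the
+C original source; umul_ppmm is the usual longlong.h macro):
+C
+C	mp_limb_t cy = 0;	/* or the cy argument, for mpn_mul_1c */
+C	for (i = 0; i < n; i++)
+C	  {
+C	    umul_ppmm (hi, lo, up[i], vl);
+C	    lo += cy;
+C	    cy = hi + (lo < cy);	/* hi+1 cannot overflow */
+C	    rp[i] = lo;
+C	  }
+C	return cy;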
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+define(`cy', `r36')	C for mpn_mul_1c
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mfi
+	adds		r15 = -1, n		C M I
+	mov		f9 = f0			C F
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmi
+	ldf8		f7 = [up], 8		C M
+	nop.m		0			C M
+	and		r14 = 3, n		C M I
+	;;
+}
+.Lcommon:
+{.mii
+	setf.sig	f6 = vl			C M2 M3
+	shr.u		r31 = r15, 2		C I0
+	cmp.eq		p10, p0 = 0, r14	C M I
+}
+{.mii
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	nop.i		0			C I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	mov		r20 = 0
+	br.cloop.dptk	.grt1			C B
+
+	xma.l		f39 = f7, f6, f9	C F
+	xma.hu		f43 = f7, f6, f9	C F
+	;;
+	getf.sig	r8 = f43		C M2
+	stf8		[rp] = f39		C M2 M3
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+
+.grt1:
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f7, f6, f9
+	xma.hu		f43 = f7, f6, f9
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt5
+
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	stf8		[rp] = f39, 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	getf.sig	r16 = f38
+	br		.Lcj5
+
+.grt5:
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	;;
+	getf.sig	r18 = f36
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	;;
+	getf.sig	r19 = f37
+	xma.hu		f43 = f35, f6, f0
+	br		.LL01
+
+
+.Lb10:	ldf8		f35 = [up], 8
+	mov		r23 = 0
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f9
+	xma.hu		f42 = f7, f6, f9
+	;;
+	stf8		[rp] = f38, 8
+	xma.l		f39 = f35, f6, f42
+	xma.hu		f43 = f35, f6, f42
+	;;
+	getf.sig	r8 = f43
+	stf8		[rp] = f39
+	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+
+
+.grt2:
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f9
+	xma.hu		f42 = f7, f6, f9
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt6
+
+	stf8		[rp] = f38, 8
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	br		.Lcj6
+
+.grt6:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	;;
+	getf.sig	r18 = f36
+	xma.hu		f42 = f34, f6, f0
+	br		.LL10
+
+
+.Lb11:	ldf8		f34 = [up], 8
+	mov		r22 = 0
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f37 = f7, f6, f9
+	xma.hu		f41 = f7, f6, f9
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	stf8		[rp] = f37, 8
+	getf.sig	r16 = f38
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	getf.sig	r8 = f43
+	br		.Lcj3
+
+.grt3:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f7, f6, f9
+	xma.hu		f41 = f7, f6, f9
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r19 = f37
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt7
+
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	getf.sig	r20 = f42
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+	getf.sig	r21 = f43
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r18 = f36
+	st8		[rp] = r19, 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	br		.Lcj7
+
+.grt7:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.hu		f41 = f33, f6, f0
+	br		.LL11
+
+
+.Lb00:	ldf8		f33 = [up], 8
+	mov		r21 = 0
+	;;
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f35 = [up], 8
+	xma.l		f36 = f7, f6, f9
+	xma.hu		f40 = f7, f6, f9
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	stf8		[rp] = f36, 8
+	xma.l		f39 = f35, f6, f0
+	getf.sig	r19 = f37
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	getf.sig	r16 = f38
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	br		.Lcj4
+
+.grt4:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r18 = f36
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	;;
+	getf.sig	r19 = f37
+	getf.sig	r23 = f41
+	xma.hu		f43 = f35, f6, f0
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt8
+
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	getf.sig	r20 = f42
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	st8		[rp] = r18, 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	br		.Lcj8
+
+.grt8:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	br		.LL00
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r17
+	st8		[rp] = r24, 8
+	xma.hu		f40 = f32, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+.LL00:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+	nop.b		0
+	ldf8		f32 = [up], 8
+   (p9)	add		r24 = r18, r21
+	nop.b		0
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r18
+	st8		[rp] = r24, 8
+	xma.hu		f41 = f33, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	;;
+.LL11:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r21 = f43
+   (p6)	add		r24 = r19, r22, 1
+	nop.b		0
+	ldf8		f33 = [up], 8
+   (p7)	add		r24 = r19, r22
+	nop.b		0
+	;;
+	.pred.rel "mutex",p6,p7
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r19
+	st8		[rp] = r24, 8
+	xma.hu		f42 = f34, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.LL10:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r22 = f40
+   (p8)	add		r24 = r16, r23, 1
+	nop.b		0
+	ldf8		f34 = [up], 8
+   (p9)	add		r24 = r16, r23
+	nop.b		0
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r16
+	st8		[rp] = r24, 8
+	xma.hu		f43 = f35, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.LL01:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r23 = f41
+   (p6)	add		r24 = r17, r20, 1
+	nop.b		0
+	ldf8		f35 = [up], 8
+   (p7)	add		r24 = r17, r20
+	br.cloop.dptk	.Loop
+C *** MAIN LOOP END ***
+	;;
+
+.Lcj9:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r17
+	st8		[rp] = r24, 8
+	xma.hu		f40 = f32, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+   (p9)	add		r24 = r18, r21
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r18
+	st8		[rp] = r24, 8
+	xma.hu		f41 = f33, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	;;
+.Lcj8:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r21 = f43
+   (p6)	add		r24 = r19, r22, 1
+   (p7)	add		r24 = r19, r22
+	;;
+	.pred.rel "mutex",p6,p7
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r19
+	st8		[rp] = r24, 8
+	xma.hu		f42 = f34, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.Lcj7:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r22 = f40
+   (p8)	add		r24 = r16, r23, 1
+   (p9)	add		r24 = r16, r23
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r16
+	st8		[rp] = r24, 8
+	xma.hu		f43 = f35, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.Lcj6:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r23 = f41
+   (p6)	add		r24 = r17, r20, 1
+   (p7)	add		r24 = r17, r20
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.leu		p8, p9 = r24, r17
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	getf.sig	r16 = f38
+	st8		[rp] = r24, 8
+	;;
+.Lcj5:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+   (p9)	add		r24 = r18, r21
+	;;
+	.pred.rel "mutex",p8,p9
+   (p8)	cmp.leu		p6, p7 = r24, r18
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	getf.sig	r17 = f39
+	st8		[rp] = r24, 8
+	;;
+.Lcj4:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r8 = f43
+   (p6)	add		r24 = r19, r22, 1
+   (p7)	add		r24 = r19, r22
+	;;
+	.pred.rel "mutex",p6,p7
+	st8		[rp] = r24, 8
+   (p6)	cmp.leu		p8, p9 = r24, r19
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.Lcj3:
+	.pred.rel "mutex",p8,p9
+   (p8)	add		r24 = r16, r23, 1
+   (p9)	add		r24 = r16, r23
+	;;
+	.pred.rel "mutex",p8,p9
+	st8		[rp] = r24, 8
+   (p8)	cmp.leu		p6, p7 = r24, r16
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.Lcj2:
+	.pred.rel "mutex",p6,p7
+   (p6)	add		r24 = r17, r20, 1
+   (p7)	add		r24 = r17, r20
+	;;
+	.pred.rel "mutex",p6,p7
+	st8		[rp] = r24, 8
+   (p6)	cmp.leu		p8, p9 = r24, r17
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+   (p8)	add		r8 = 1, r8
+	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	adds		r15 = -1, n		C M I
+	setf.sig	f9 = cy			C M2 M3
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmb
+	ldf8		f7 = [up], 8		C M
+	and		r14 = 3, n		C M I
+	br.sptk		.Lcommon
+	;;
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mul_2.asm b/third_party/gmp/mpn/ia64/mul_2.asm
new file mode 100644
index 0000000..5343f64
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mul_2.asm
@@ -0,0 +1,625 @@
+dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl  store the result to an (n+1)-limb number.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    ?
+C Itanium 2:  1.5
+
+C TODO
+C  * Clean up variable names, and try to decrease the number of distinct
+C    registers used.
+C  * Clean up feed-in code to not require zeroing several registers.
+C  * Make sure we don't depend on uninitialized predicate registers.
+C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C    wind-down code.
+C  * Ultimately rewrite.  The problem with this code is that it first uses a
+C    loaded u value in one xma pair, then leaves it live over several unrelated
+C    xma pairs, before it uses it again.  It should actually be quite possible
+C    to just swap some aligned xma pairs around.  But we should then schedule
+C    u loads further from the first use.
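+C
+C For reference, a plain-C model (a sketch, not from the original source;
+C it assumes the usual mpn_mul_2 convention of writing the n+1 low result
+C limbs at rp and returning the most significant limb, with u128 standing
+C for an unsigned 128-bit type):
+C
+C	mp_limb_t lo = 0, hi = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    u128 t = (u128) up[i] * v0 + lo;
+C	    rp[i] = (mp_limb_t) t;
+C	    u128 s = (u128) up[i] * v1 + hi + (mp_limb_t) (t >> 64);
+C	    lo = (mp_limb_t) s;
+C	    hi = (mp_limb_t) (s >> 64);
+C	  }
+C	rp[n] = lo;
+C	return hi;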
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`ux',`f49')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		nop	1			C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		nop	1			C			M
+		cmp.eq	p10, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		nop	1			C			M
+		cmp.eq	p12, p0 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p10)	br.dptk	L(b01)			C			B
+	(p11)	br.dptk	L(b10)			C			B
+	(p12)	br.dptk	L(b11)			C			B
+	;;
+}
+	ALIGN(32)
+L(b00):		ldf8	u_1 = [up], 8
+		mov	acc1_2 = 0
+		mov	pr1_2 = 0
+		mov	pr0_3 = 0
+		cmp.ne	p8, p9 = r0, r0
+	;;
+		xma.l	fp0b_3 = ux, v0, f0
+		cmp.ne	p12, p13 = r0, r0
+		ldf8	u_2 = [up], 8
+		xma.hu	fp1a_3 = ux, v0, f0
+		br.cloop.dptk	L(gt4)
+
+		xma.l	fp0b_0 = uy, v0, f0
+		xma.hu	fp1a_0 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_3
+		xma.l	fp1b_3 = ux, v1, fp1a_3
+		xma.hu	fp2a_3 = ux, v1, fp1a_3
+	;;
+		xma.l	fp0b_1 = u_1, v0, f0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		getfsig	pr1_3 = fp1b_3
+		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+		br	L(cj4)
+
+L(gt4):		xma.l	fp0b_0 = uy, v0, f0
+		xma.hu	fp1a_0 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_3
+		xma.l	fp1b_3 = ux, v1, fp1a_3
+		ldf8	u_3 = [up], 8
+		xma.hu	fp2a_3 = ux, v1, fp1a_3
+	;;
+		xma.l	fp0b_1 = u_1, v0, f0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+		xma.l	fp0b_2 = u_2, v0, f0
+	;;
+		getfsig	acc1_3 = fp2a_3
+		xma.hu	fp1a_2 = u_2, v0, f0
+		br	L(00)
+
+
+	ALIGN(32)
+L(b01):		ldf8	u_0 = [up], 8		C M
+		mov	acc1_1 = 0		C M I
+		mov	pr1_1 = 0		C M I
+		mov	pr0_2 = 0		C M I
+		cmp.ne	p6, p7 = r0, r0		C M I
+	;;
+		xma.l	fp0b_2 = ux, v0, f0	C F
+		cmp.ne	p10, p11 = r0, r0	C M I
+		ldf8	u_1 = [up], 8		C M
+		xma.hu	fp1a_2 = ux, v0, f0	C F
+	;;
+		xma.l	fp0b_3 = uy, v0, f0	C F
+		xma.hu	fp1a_3 = uy, v0, f0	C F
+	;;
+		getfsig	acc0 = fp0b_2		C M
+		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
+		ldf8	u_2 = [up], 8		C M
+		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
+		br.cloop.dptk	L(gt5)
+
+		xma.l	fp0b_0 = u_0, v0, f0	C F
+		xma.hu	fp1a_0 = u_0, v0, f0	C F
+	;;
+		getfsig	pr0_3 = fp0b_3		C M
+		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
+		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
+	;;
+		getfsig	pr1_2 = fp1b_2		C M
+		getfsig	acc1_2 = fp2a_2		C M
+		xma.l	fp0b_1 = u_1, v0, f0	C F
+		xma.hu	fp1a_1 = u_1, v0, f0	C F
+		br	L(cj5)
+
+L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
+		xma.hu	fp1a_0 = u_0, v0, f0
+	;;
+		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = uy, v1, fp1a_3
+		xma.hu	fp2a_3 = uy, v1, fp1a_3
+	;;
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+		xma.l	fp0b_1 = u_1, v0, f0
+	;;
+		getfsig	acc1_2 = fp2a_2
+		xma.hu	fp1a_1 = u_1, v0, f0
+		br	L(01)
+
+
+	ALIGN(32)
+L(b10):		br.cloop.dptk	L(gt2)
+		xma.l	fp0b_1 = ux, v0, f0
+		xma.hu	fp1a_1 = ux, v0, f0
+	;;
+		xma.l	fp0b_2 = uy, v0, f0
+		xma.hu	fp1a_2 = uy, v0, f0
+	;;
+		stf8	[rp] = fp0b_1, 8
+		xma.l	fp1b_1 = ux, v1, fp1a_1
+		xma.hu	fp2a_1 = ux, v1, fp1a_1
+	;;
+		getfsig	acc0 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		getfsig	pr1_1 = fp1b_1
+		getfsig	acc1_1 = fp2a_1
+		mov	ar.lc = r2
+		getfsig	pr1_2 = fp1b_2
+		getfsig	r8 = fp2a_2
+	;;
+		add	s0 = pr1_1, acc0
+	;;
+		st8	[rp] = s0, 8
+		cmp.ltu	p8, p9 = s0, pr1_1
+		sub	r31 = -1, acc1_1
+	;;
+	.pred.rel "mutex", p8, p9
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	(p8)	cmp.leu	p10, p0 = r31, pr1_2
+	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
+	;;
+		st8	[rp] = acc0, 8
+	(p10)	add	r8 = 1, r8
+		br.ret.sptk.many b0
+
+L(gt2):		ldf8	u_3 = [up], 8
+		mov	acc1_0 = 0
+		mov	pr1_0 = 0
+	;;
+		mov	pr0_1 = 0
+		xma.l	fp0b_1 = ux, v0, f0
+		ldf8	u_0 = [up], 8
+		xma.hu	fp1a_1 = ux, v0, f0
+	;;
+		xma.l	fp0b_2 = uy, v0, f0
+		xma.hu	fp1a_2 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_1
+		xma.l	fp1b_1 = ux, v1, fp1a_1
+		xma.hu	fp2a_1 = ux, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		xma.l	fp0b_3 = u_3, v0, f0
+		xma.hu	fp1a_3 = u_3, v0, f0
+	;;
+		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	;;
+ {.mfi;		getfsig	acc1_1 = fp2a_1
+		xma.l	fp0b_0 = u_0, v0, f0
+		cmp.ne	p8, p9 = r0, r0
+}{.mfb;		cmp.ne	p12, p13 = r0, r0
+		xma.hu	fp1a_0 = u_0, v0, f0
+		br	L(10)
+}
+
+	ALIGN(32)
+L(b11):		mov	acc1_3 = 0
+		mov	pr1_3 = 0
+		mov	pr0_0 = 0
+		ldf8	u_2 = [up], 8
+		cmp.ne	p6, p7 = r0, r0
+		br.cloop.dptk	L(gt3)
+	;;
+		xma.l	fp0b_0 = ux, v0, f0
+		xma.hu	fp1a_0 = ux, v0, f0
+	;;
+		cmp.ne	p10, p11 = r0, r0
+		xma.l	fp0b_1 = uy, v0, f0
+		xma.hu	fp1a_1 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_0
+		xma.l	fp1b_0 = ux, v1, fp1a_0
+		xma.hu	fp2a_0 = ux, v1, fp1a_0
+	;;
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		getfsig	pr1_0 = fp1b_0
+		getfsig	acc1_0 = fp2a_0
+		br	L(cj3)
+
+L(gt3):		xma.l	fp0b_0 = ux, v0, f0
+		cmp.ne	p10, p11 = r0, r0
+		ldf8	u_3 = [up], 8
+		xma.hu	fp1a_0 = ux, v0, f0
+	;;
+		xma.l	fp0b_1 = uy, v0, f0
+		xma.hu	fp1a_1 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_0
+		xma.l	fp1b_0 = ux, v1, fp1a_0
+		ldf8	u_0 = [up], 8
+		xma.hu	fp2a_0 = ux, v1, fp1a_0
+	;;
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	;;
+		getfsig	acc1_0 = fp2a_0
+		xma.l	fp0b_3 = u_3, v0, f0
+		xma.hu	fp1a_3 = u_3, v0, f0
+		br	L(11)
+
+
+C *** MAIN LOOP START ***
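+C The loop is unrolled four ways, with entry points L(01), L(00), L(11) and
+C L(10).  Carry decisions live in predicate pairs (p6/p7 and p8/p9 for one
+C chain, p10/p11 and p12/p13 for the other); the .pred.rel "mutex" notes tell
+C the assembler each pair is mutually exclusive, so the two predicated adds
+C writing the same register in one group do not conflict.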
+	ALIGN(32)
+L(top):						C 00
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;					C 01
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_2 = fp2a_2
+		st8	[rp] = s0, 8
+		xma.l	fp0b_1 = u_1, v0, f0
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;					C 02
+L(01):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;					C 03
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;					C 04
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_3 = fp2a_3
+		st8	[rp] = s0, 8
+		xma.l	fp0b_2 = u_2, v0, f0
+	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;					C 05
+L(00):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;					C 06
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;					C 07
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_0 = fp2a_0
+		st8	[rp] = s0, 8
+		xma.l	fp0b_3 = u_3, v0, f0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+		xma.hu	fp1a_3 = u_3, v0, f0
+	;;					C 08
+L(11):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;					C 09
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;					C 10
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_1 = fp2a_1
+		st8	[rp] = s0, 8
+		xma.l	fp0b_0 = u_0, v0, f0
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+		xma.hu	fp1a_0 = u_0, v0, f0
+	;;					C 11
+L(10):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+		br.cloop.dptk	L(top)
+	;;
+C *** MAIN LOOP END ***
+
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mfi;		getfsig	acc1_2 = fp2a_2
+		xma.l	fp0b_1 = u_1, v0, f0
+		nop	1
+}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+}
+L(cj5):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_3 = fp1b_3
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mfi;		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, f0
+		nop	1
+}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+}
+L(cj4):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_0 = fp1b_0
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	acc1_0 = fp2a_0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+	;;
+}
+L(cj3):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_1 = fp1b_1
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		getfsig	acc1_1 = fp2a_1
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+	;;
+}	.pred.rel "mutex", p12, p13
+ {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	r8 = fp2a_2
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		st8	[rp] = s0, 8
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
+	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
+	(p12)	add	acc0 = 1, acc0
+	;;
+}{.mmi;		st8	[rp] = acc0, 8
+	(p12)	cmpeqor	p10, p0 = 0, acc0
+		nop	1
+	;;
+}{.mib;	(p10)	add	r8 = 1, r8
+		mov	ar.lc = r2
+		br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/popcount.asm b/third_party/gmp/mpn/ia64/popcount.asm
new file mode 100644
index 0000000..c0b5c5c
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/popcount.asm
@@ -0,0 +1,200 @@
+dnl  IA-64 mpn_popcount -- mpn population count.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       1.5
+C Itanium 2:     1
+
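+C  Reference model (editor's sketch, not upstream text): the function simply
+C  sums the population counts of the limbs.  In portable C, assuming 64-bit
+C  limbs and a GCC-style builtin:
+C
+C	mp_limb_t
+C	ref_popcount (const mp_limb_t *up, mp_size_t n)
+C	{
+C	  mp_limb_t s = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    s += (mp_limb_t) __builtin_popcountll (up[i]);
+C	  return s;
+C	}
+C
+C  The asm reaches 1 c/l on Itanium 2 by keeping four popcnt results (c0-c3)
+C  live and folding one of them into s per instruction group, as the
+C  .Loop/.LLxx rotation below shows.
+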
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n', `r33')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
+
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+ {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
+	ld8		r10 = [up], 8		C load first limb	M01
+	mov.i		r2 = ar.lc		C save ar.lc		I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p14 = 4, n		C small count?		M I
+	add		n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+
+.Lb00:	ld8		u1 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = r10		C			I0
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+  (p15)	br.cond.dptk	.grt4			C			B
+	;;
+	nop.m	0				C			-
+	nop.m	0				C			-
+	popcnt		c2 = u2			C			I0
+	;;
+	mov		s = c0			C			M I
+	popcnt		c3 = u3			C			I0
+	br		.Lcj4			C			B
+
+.grt4:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	br		.LL00			C			B
+
+
+.Lb01:
+	popcnt		s = r10			C			I0
+  (p14)	br.ret.sptk.many b0			C			B
+
+.grt1:	ld8		u0 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	mov		c3 = 0			C			I0
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+	br.cloop.dptk	.Loop			C			B
+	br		.Lend			C			B
+
+
+.Lb10:	ld8		u3 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+  (p15)	br.cond.dptk	.grt2			C			B
+
+	popcnt		s = r10			C			I0
+	;;
+	popcnt		c3 = u3			C			I0
+	br		.Lcj2			C			B
+
+.grt2:	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	popcnt		c2 = r10		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		u2 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		s = r10			C			I0
+  (p15)	br.cond.dptk	.grt3			C			B
+
+	popcnt		c2 = u2			C			I0
+	;;
+	popcnt		c3 = u3			C			I0
+	br		.Lcj3			C			B
+
+.grt3:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	mov.i		ar.lc = n		C			I0
+	mov		c1 = 0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	br		.LL11			C			B
+
+
+.Loop:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	add		s = s, c3		C			M I
+	;;
+.LL00:	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	add		s = s, c0		C			M I
+	;;
+.LL11:	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	add		s = s, c1		C			M I
+	;;
+.LL10:	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+	add		s = s, c2		C			M I
+	lfetch		[r9], 32		C			M01
+	nop.m		0			C			-
+	br.cloop.dptk	.Loop			C			B
+	;;
+
+.Lend:	popcnt		c2 = u2			C			I0
+	add		s = s, c3		C			M I
+	;;
+	popcnt		c3 = u3			C			I0
+	add		s = s, c0		C			M I
+	;;
+.Lcj4:	add		s = s, c1		C			M I
+	;;
+.Lcj3:	add		s = s, c2		C			M I
+	;;
+.Lcj2:	add		s = s, c3		C			M I
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/rsh1aors_n.asm b/third_party/gmp/mpn/ia64/rsh1aors_n.asm
new file mode 100644
index 0000000..3c7defb
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/rsh1aors_n.asm
@@ -0,0 +1,447 @@
+dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    2.5
+C Itanium 2:  1.5
+
+C TODO
+C  * Rewrite function entry code using aorslsh1_n.asm style.
+C  * Micro-optimize feed-in and wind-down code.
+
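+C  Reference model (editor's sketch, not upstream text): for the add case the
+C  function computes rp[] = ({up,n} + {vp,n}) >> 1 and returns the bit
+C  shifted out, i.e. the low bit of the unshifted sum.  In portable C with
+C  64-bit limbs:
+C
+C	mp_limb_t
+C	ref_rsh1add_n (mp_limb_t *rp, const mp_limb_t *up,
+C	               const mp_limb_t *vp, mp_size_t n)
+C	{
+C	  unsigned __int128 s = (unsigned __int128) up[0] + vp[0];
+C	  mp_limb_t prev = (mp_limb_t) s;
+C	  mp_limb_t cy = (mp_limb_t) (s >> 64);
+C	  mp_limb_t retbit = prev & 1;
+C	  for (mp_size_t i = 1; i < n; i++)
+C	    {
+C	      s = (unsigned __int128) up[i] + vp[i] + cy;
+C	      mp_limb_t w = (mp_limb_t) s;
+C	      cy = (mp_limb_t) (s >> 64);
+C	      rp[i - 1] = (prev >> 1) | (w << 63);	/* the shrp below */
+C	      prev = w;
+C	    }
+C	  rp[n - 1] = (prev >> 1) | (cy << 63);	/* carry becomes top bit */
+C	  return retbit;
+C	}
+C
+C  The subtract case is identical with borrows in place of carries; that is
+C  what the ADDSUB/PRED/INCR/LIM macros below select between.
+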
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`vp',`r34')
+define(`n',`r35')
+
+ifdef(`OPERATION_rsh1add_n',`
+  define(ADDSUB,       add)
+  define(PRED,	       ltu)
+  define(INCR,	       1)
+  define(LIM,	       -1)
+  define(func, mpn_rsh1add_n)
+')
+ifdef(`OPERATION_rsh1sub_n',`
+  define(ADDSUB,       sub)
+  define(PRED,	       gtu)
+  define(INCR,	       -1)
+  define(LIM,	       0)
+  define(func, mpn_rsh1sub_n)
+')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4		rp = 0, rp		C			M I
+	addp4		up = 0, up		C			M I
+	addp4		vp = 0, vp		C			M I
+	nop.m		0
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+ {.mmi;	ld8		r11 = [vp], 8		C			M01
+	ld8		r10 = [up], 8		C			M01
+	mov.i		r2 = ar.lc		C			I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p0 = 4, n		C			M I
+	add		n = -4, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+.Lb00:	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	ADDSUB		w3 = r10, r11		C			M I
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+  (p15)	br.dpnt		.grt4			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, r10	C			M I
+	and		r8 = 1, w3		C			M I
+	ADDSUB		w0 = u0, v0		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	br		.Lcj4			C			B
+
+.grt4:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, r10	C			M I
+	ld8		u3 = [up], 8		C			M01
+	and		r8 = 1, w3		C			M I
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+	add		n = -1, n
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+	shrp		x3 = w0, w3, 1		C			I0
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	br		.LL00			C			B
+
+
+.Lb01:	ADDSUB		w2 = r10, r11		C			M I
+	shr.u		n = n, 2		C			I0
+  (p15)	br.dpnt		.grt1			C			B
+	;;
+
+	cmp.PRED	p6, p7 = w2, r10	C			M I
+	shr.u		x2 = w2, 1		C			I0
+	and		r8 = 1, w2		C			M I
+	;;
+   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
+	br		.Lcj1			C			B
+
+.grt1:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C FIXME swap with next	I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	cmp.PRED	p6, p0 = w2, r10	C			M I
+	and		r8 = 1, w2		C			M I
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt5			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	br		.Lcj5			C			B
+
+.grt5:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	br		.LL01			C			B
+
+
+.Lb10:	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	ADDSUB		w1 = r10, r11		C			M I
+  (p15)	br.dpnt		.grt2			C			B
+	;;
+
+	cmp.PRED	p9, p0 = w1, r10	C			M I
+	and		r8 = 1, w1		C			M I
+	ADDSUB		w2 = u2, v2		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	;;
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	shr.u		x2 = w2, 1		C			I0
+	br		.Lcj2			C			B
+
+.grt2:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, r10	C			M I
+	ld8		u1 = [up], 8		C			M01
+	and		r8 = 1, w1		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt6			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	br		.Lcj6			C			B
+
+.grt6:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w0 = r10, r11		C			M I
+  (p15)	br.dpnt		.grt3			C			B
+	;;
+
+	cmp.PRED	p8, p0 = w0, r10	C			M I
+	ADDSUB		w1 = u1, v1		C			M I
+	and		r8 = 1, w0		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	;;
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.Lcj3			C			B
+
+.grt3:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	cmp.PRED	p8, p0 = w0, r10	C			M I
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	and		r8 = 1, w0		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt7			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.Lcj7			C			B
+
+.grt7:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.LL11			C			B
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:	st8		[rp] = x3, 8		C			M23
+	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.LL11:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+.LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	nop.b		0
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	nop.b		0
+	;;
+	st8		[rp] = x1, 8		C			M23
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+.LL01:	st8		[rp] = x2, 8		C			M23
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+.LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	nop.b		0
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.Loop			C			B
+	;;
+C *** MAIN LOOP END ***
+
+.Lskip:	st8		[rp] = x3, 8		C			M23
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.Lcj7:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+.Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	st8		[rp] = x1, 8		C			M23
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+.Lcj5:	st8		[rp] = x2, 8		C			M23
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+.Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	;;
+	st8		[rp] = x3, 8		C			M23
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.Lcj3:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	shr.u		x2 = w2, 1		C			I0
+	;;
+.Lcj2:	st8		[rp] = x1, 8		C			M23
+   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
+	;;
+.Lcj1:	st8		[rp] = x2		C			M23
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/sec_tabselect.asm b/third_party/gmp/mpn/ia64/sec_tabselect.asm
new file mode 100644
index 0000000..9b11cde
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/sec_tabselect.asm
@@ -0,0 +1,148 @@
+dnl  IA-64 mpn_sec_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       ?
+C Itanium 2:     2.5
+
+C NOTES
+C  * Using software pipelining could trivially yield 2 c/l without unrolling,
+C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
+C    code, for simplicity.)
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r32')
+define(`tp',     `r33')
+define(`n',      `r34')
+define(`nents',  `r35')
+define(`which',  `r36')
+
+define(`mask',   `r8')
+
+define(`rp1',     `r32')
+define(`tp1',     `r33')
+define(`rp2',     `r14')
+define(`tp2',     `r15')
+
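+C  Reference model (editor's sketch, not upstream text): copy entry `which'
+C  out of a table of `nents' entries of `n' limbs each, while touching every
+C  entry so the memory access pattern is independent of `which'.  In C:
+C
+C	void
+C	ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tp,
+C	                   mp_size_t n, mp_size_t nents, mp_size_t which)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    rp[i] = 0;
+C	  for (mp_size_t k = 0; k < nents; k++)
+C	    {
+C	      /* all ones exactly when k == which */
+C	      mp_limb_t mask = -(mp_limb_t) (k == which);
+C	      for (mp_size_t i = 0; i < n; i++)
+C	        rp[i] |= tp[k * n + i] & mask;
+C	    }
+C	}
+C
+C  The asm below instead merges into the previous rp contents with
+C  and/andcm/or, and replaces `which' by nents-which up front so that
+C  decrementing `nents' per outer pass makes the compare hit exactly at the
+C  selected entry.
+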
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+ {.mmi;	addp4	rp = 0, rp		C			M I
+	addp4	tp = 0, tp		C			M I
+	zxt4	n = n			C			I
+}{.mii;	nop	0
+	zxt4	nents = nents		C			I
+	zxt4	which = which		C			I
+	;;
+}')
+ {.mmi;	add	rp2 = 8, rp1
+	add	tp2 = 8, tp1
+	add	r6 = -2, n
+	;;
+}{.mmi;	cmp.eq	p10, p0 = 1, n
+	and	r9 = 1, n		C low bit of n, for even/odd test below
+	shr.u	r6 = r6, 1		C inner loop count
+	;;
+}{.mmi;	cmp.eq	p8, p0 = 0, r9
+	sub	which = nents, which
+	shl	n = n, 3
+	;;
+}
+L(outer):
+ {.mmi;	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
+	nop	0
+	mov	ar.lc = r6		C			I0
+	;;
+}{.mmb;
+  (p6)	mov	mask = -1
+  (p7)	mov	mask = 0
+  (p8)	br.dptk	L(top)			C branch to loop entry if n even
+	;;
+}{.mmi;	ld8	r16 = [tp1], 8
+	add	tp2 = 8, tp2
+	nop	0
+	;;
+}{.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+	;;
+}{.mmi;	andcm	r18 = r18, mask
+	;;
+	or	r16 = r16, r18
+	nop	0
+	;;
+}{.mmb;	st8	[rp1] = r16, 8
+	add	rp2 = 8, rp2
+  (p10)	br.dpnt	L(end)
+}
+	ALIGN(32)
+L(top):
+ {.mmi;	ld8	r16 = [tp1], 16
+	ld8	r17 = [tp2], 16
+	nop	0
+	;;
+}{.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+}{.mmi;	ld8	r19 = [rp2]
+	and	r17 = r17, mask
+	nop	0
+	;;
+}{.mmi;	andcm	r18 = r18, mask
+	andcm	r19 = r19, mask
+	nop	0
+	;;
+}{.mmi;	or	r16 = r16, r18
+	or	r17 = r17, r19
+	nop	0
+	;;
+}{.mmb;	st8	[rp1] = r16, 16
+	st8	[rp2] = r17, 16
+	br.cloop.dptk	L(top)
+	;;
+}
+L(end):
+ {.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
+	sub	rp2 = rp2, n		C move rp back to beginning
+	cmp.ne	p9, p0 = 1, nents
+}{.mmb;	add	nents = -1, nents
+	nop	0
+  (p9)	br.dptk	L(outer)
+	;;
+}{.mib;	nop	0
+	nop	0
+	br.ret.sptk.many b0
+}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..727f489
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm
@@ -0,0 +1,156 @@
+dnl  IA-64 mpn_sqr_diag_addlsh1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2	Unrolling could bring it to 1.5 + epsilon
+
+C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
+C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
+C -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56 Needs updating
+C -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
+
+C We should keep in mind that this code takes linear time in an O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60.  Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C    p-registers.
+C  * Optimise by doing first two loop iterations in function header.
+
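+C  Reference model (editor's sketch; the exact offsets are inferred from the
+C  use in sqr_basecase and should be checked against the generic C code):
+C  with {tp,2n-2} holding the off-diagonal products at weight B, compute
+C  {rp,2n} = 2*{tp,2n-2}*B + sum(up[i]^2 * B^(2i)):
+C
+C	void
+C	ref_sqr_diag_addlsh1 (mp_limb_t *rp, const mp_limb_t *tp,
+C	                      const mp_limb_t *up, mp_size_t n)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)	/* diagonal squares */
+C	    {
+C	      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
+C	      rp[2 * i] = (mp_limb_t) sq;
+C	      rp[2 * i + 1] = (mp_limb_t) (sq >> 64);
+C	    }
+C	  mp_limb_t cy = 0, hibit = 0;
+C	  for (mp_size_t i = 0; i < 2 * n - 2; i++)	/* add 2*tp at weight B */
+C	    {
+C	      mp_limb_t dbl = (tp[i] << 1) | hibit;
+C	      hibit = tp[i] >> 63;
+C	      unsigned __int128 s = (unsigned __int128) rp[i + 1] + dbl + cy;
+C	      rp[i + 1] = (mp_limb_t) s;
+C	      cy = (mp_limb_t) (s >> 64);
+C	    }
+C	  rp[2 * n - 1] += cy + hibit;	/* cannot overflow for a true square */
+C	}
+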
+C INPUT PARAMETERS
+define(`rp_param', `r32')  define(`rp', `r14')		C size: 2n
+define(`tp_param', `r33')  define(`tp', `r15')		C size: 2n - 2
+define(`up_param', `r34')  define(`up', `r31')		C size: n
+define(`n',  `r35')
+
+ifdef(`HAVE_ABI_32',`
+	define(`ABI64', `')
+	define(`ABI32', `$1')
+',`
+	define(`ABI64', `$1')
+	define(`ABI32', `')
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+
+	.prologue
+	.save	ar.pfs, r2
+	.save	ar.lc, r3
+	.body
+
+ {.mii;		alloc	r2 = ar.pfs, 4,24,0,24	C			M
+		mov	r3 = ar.lc		C			I0
+	ABI64(`	nop	4711		')
+	ABI32(`	zxt4	n = n		')
+}{.mmi;	ABI64(`	mov	tp = tp_param	')	C			M I
+	ABI32(`	addp4	tp = 0, tp_param')	C			M I
+	ABI64(`	mov	up = up_param	')	C			M I
+	ABI32(`	addp4	up = 0, up_param')	C			M I
+	ABI64(`	mov	rp = rp_param	')	C			M I
+	ABI32(`	addp4	rp = 0, rp_param')	C			M I
+	;;
+}{.mmi;		ld8	r36 = [tp], 8		C			M
+		add	r20 = -2, n		C			M I
+		mov	r9 = ar.ec		C			I0
+	;;
+}{.mmi;		ld8	r32 = [tp], 8		C			M
+		mov	r16 = 0			C			M I
+		mov	ar.ec = 7		C			I0
+	;;
+}{.mmi;		nop	4711
+		mov	r44 = 0			C			M I
+		mov	ar.lc = r20		C			I0
+	;;
+}{.mii;		mov	r33 = 0
+		mov	r10 = pr		C			I0
+		mov	pr.rot = 0x30000	C			I0
+	;;
+}		br.cexit.spnt.few.clr	L(end)
+
+dnl *** MAIN LOOP START ***
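+C The loop is software-pipelined with br.ctop: the alloc in the prologue
+C provides rotating registers, ar.ec = 7 sets the stage count for pipeline
+C fill and drain, and the (p16)-(p24) stage predicates gate each instruction
+C into its pipeline stage.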
+	ALIGN(32)
+L(top):
+ {.mfi;	(p18)	ldf8	f33 = [up], 8		C			M
+	(p20)	xma.l	f36 = f35, f35, f42	C			F
+	(p41)	cmpequc	p50, p0 = -1, r44	C			M I
+}{.mfi;		setfsig	f40 = r16		C			M23
+	(p20)	xma.hu	f38 = f35, f35, f42	C			F
+	(p23)	add	r50 = r41, r49		C			M I
+	;;
+}{.mmi;	(p16)	ld8	r36 = [tp], 8		C			M
+	(p23)	cmpltu	p40, p0 = r50, r41	C cyout hi		M I
+	(p19)	shrp	r45 = r38, r35, 63	C non-critical		I0
+}{.mmi;	(p21)	getfsig	r39 = f39		C hi			M2
+	(p24)	st8	[rp] = r51, 8		C hi			M23
+	(p41)	add	r44 = 1, r44		C			M I
+	;;
+}{.mmi;	(p16)	ld8	r32 = [tp], 8		C			M
+	(p50)	cmpeqor	p40, p0 = -1, r50	C cyout hi		M I
+	(p17)	shrp	r16 = r33, r37, 63	C critical		I0
+}{.mmi;	(p21)	getfsig	r42 = f37		C lo			M2
+	(p23)	st8	[rp] = r44, 8		C lo			M23
+	(p50)	add	r50 = 1, r50		C			M I
+	;;
+}		br.ctop.sptk.few.clr L(top)	C			B
+dnl *** MAIN LOOP END ***
+	;;
+L(end):
+ {.mmi;		nop	4711
+	(p41)	add	r44 = 1, r44		C			M I
+		shr.u	r48 = r39, 63		C			I0
+	;;
+}{.mmi;		st8	[rp] = r51, 8		C			M23
+	(p41)	cmpequc	p6, p0 = 0, r44		C			M I
+		add	r50 = r41, r48		C			M I
+	;;
+}{.mmi;		st8	[rp] = r44, 8		C			M23
+	(p6)	add	r50 = 1, r50		C			M I
+		mov	ar.lc = r3		C			I0
+	;;
+}{.mii;		st8	[rp] = r50		C			M23
+		mov	ar.ec = r9		C			I0
+		mov	pr = r10		C			I0
+	;;
+}{.mib;		nop	4711
+		mov	ar.pfs = r2		C			I0
+		br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/submul_1.asm b/third_party/gmp/mpn/ia64/submul_1.asm
new file mode 100644
index 0000000..cb2a552
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/submul_1.asm
@@ -0,0 +1,647 @@
+dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4.0
+C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C  * Optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C    2nd bundle.  This would let the bbb bundle issue one cycle earlier,
+C    saving a cycle.
+
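+C  Reference model (editor's sketch, not upstream text): {rp,n} -= {up,n}*vl,
+C  returning the final borrow limb.  In portable C with 64-bit limbs:
+C
+C	mp_limb_t
+C	ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+C	              mp_limb_t vl)
+C	{
+C	  mp_limb_t cy = 0;			/* running borrow, < B */
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C	      mp_limb_t plo = (mp_limb_t) p;
+C	      mp_limb_t phi = (mp_limb_t) (p >> 64);
+C	      mp_limb_t r = rp[i];
+C	      rp[i] = r - plo;
+C	      cy = phi + (r < plo);		/* borrow out of this limb */
+C	    }
+C	  return cy;
+C	}
+C
+C  Note how the asm negates vl once (sub vl = r0, vl) and feeds the loaded rp
+C  limb into the xma, folding most of the subtraction into the
+C  multiply-accumulate.
+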
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	mov		r10 = rp		C M I
+	mov		r9 = up			C M I
+	sub		vl = r0, vl		C M I	negate vl
+}
+{.mmi
+	ldf8		f8 = [rp], 8		C M
+	ldf8		f7 = [up], 8		C M
+	add		r19 = -1, n		C M I	n - 1
+	;;
+}
+{.mmi
+	cmp.eq		p6, p0 = 0, vl		C M I
+	mov		r8 = 0			C M I	zero cylimb
+	mov		r2 = ar.lc		C I0
+}
+{.mmi
+	setf.sig	f6 = vl			C M2 M3
+	and		r14 = 3, n		C M I
+	shr.u		r19 = r19, 2		C I0
+	;;
+}
+{.mmb
+	nop		0
+	cmp.eq		p10, p0 = 0, r14	C M I
+   (p6)	br.spnt		.Ldone			C B	vl == 0
+}
+{.mmi
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	mov		ar.lc = r19		C I0
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1
+
+	xma.l		f39 = f7, f6, f8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj1
+
+.grt1:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f46 = [rp], 8
+	xma.l		f39 = f7, f6, f8
+	ldf8		f34 = [up], 8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	ldf8		f47 = [rp], 8
+	xma.l		f36 = f32, f6, f44
+	ldf8		f35 = [up], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt5
+	;;
+
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	getf.sig	r26 = f38			C lo
+	ld8		r23 = [r9], 8
+	br		.Lcj5
+
+.grt5:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.Loop
+	br		.Lend
+
+
+.Lb10:	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r26 = f38			C lo
+	getf.sig	r30 = f42			C hi
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj2
+
+.grt2:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt6
+
+	getf.sig	r30 = f42			C hi
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	br		.Lcj6
+
+.grt6:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	br		.LL10
+
+
+.Lb11:	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt3
+
+	xma.l		f37 = f7, f6, f8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	ld8		r22 = [r9], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	getf.sig	r30 = f42			C hi
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj3
+
+.grt3:	ldf8		f44 = [rp], 8
+	xma.l		f37 = f7, f6, f8
+	ldf8		f32 = [up], 8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	ldf8		f45 = [rp], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f33 = [up], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt7
+	;;
+
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	br		.Lcj7
+
+.grt7:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	br		.LL11
+
+
+.Lb00:	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f47 = [rp], 8
+	xma.l		f36 = f7, f6, f8
+	ldf8		f35 = [up], 8
+	xma.hu		f40 = f7, f6, f8
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37			C lo
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	getf.sig	r26 = f38			C lo
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	ld8		r20 = [r9], 8
+	br		.Lcj4
+
+.grt4:	ldf8		f44 = [rp], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f32 = [up], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	getf.sig	r24 = f36			C lo
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37			C lo
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt8
+	;;
+
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	br		.Lcj8
+
+.grt8:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	br		.LL00
+
+	ALIGN(32)
+.Loop:
+{.mmi
+	ldf8		f44 = [rp], 8
+	cmp.ltu		p6, p0 = r27, r8	C lo cmp
+	sub		r14 = r27, r8		C lo sub
+}
+{.mmi
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	sub		r8 = r20, r31		C hi sub
+	;;				C 01
+}
+{.mmf
+	getf.sig	r27 = f39			C lo
+	st8		[r10] = r14, 8
+	xma.l		f37 = f33, f6, f45
+}
+{.mfi
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+   (p6)	add		r8 = 1, r8
+	;;				C 02
+}
+{.mmi
+.LL00:	ldf8		f45 = [rp], 8
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+}
+{.mmi
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	sub		r8 = r21, r28
+	;;				C 03
+}
+{.mmf
+	getf.sig	r24 = f36			C lo
+	st8		[r10] = r14, 8
+	xma.l		f38 = f34, f6, f46
+}
+{.mfi
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+   (p6)	add		r8 = 1, r8
+	;;				C 04
+}
+{.mmi
+.LL11:	ldf8		f46 = [rp], 8
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+}
+{.mmi
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	sub		r8 = r22, r29
+	;;				C 05
+}
+{.mmf
+	getf.sig	r25 = f37			C lo
+	st8		[r10] = r14, 8
+	xma.l		f39 = f35, f6, f47
+}
+{.mfi
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+   (p6)	add		r8 = 1, r8
+	;;				C 06
+}
+{.mmi
+.LL10:	ldf8		f47 = [rp], 8
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+}
+{.mmi
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	sub		r8 = r23, r30
+	;;				C 07
+}
+{.mmf
+	getf.sig	r26 = f38			C lo
+	st8		[r10] = r14, 8
+	xma.l		f36 = f32, f6, f44
+}
+{.mfi
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+   (p6)	add		r8 = 1, r8
+}
+	br.cloop.dptk	.Loop
+	;;
+
+.Lend:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	getf.sig	r30 = f42
+	sub		r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8		[r10] = r14, 8
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj8:
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+	getf.sig	r31 = f43
+	sub		r8 = r21, r28
+	;;
+	getf.sig	r24 = f36
+	st8		[r10] = r14, 8
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj7:
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+	getf.sig	r28 = f40
+	sub		r8 = r22, r29
+	;;
+	getf.sig	r25 = f37
+	st8		[r10] = r14, 8
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj6:
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+	getf.sig	r29 = f41
+	sub		r8 = r23, r30
+	;;
+	getf.sig	r26 = f38
+	st8		[r10] = r14, 8
+	ld8		r23 = [r9], 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj5:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	getf.sig	r30 = f42
+	sub		r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8		[r10] = r14, 8
+	ld8		r20 = [r9], 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj4:
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+	getf.sig	r31 = f43
+	sub		r8 = r21, r28
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj3:
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+	sub		r8 = r22, r29
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj2:
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+	sub		r8 = r23, r30
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj1:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	sub		r8 = r20, r31
+	;;
+	st8		[r10] = r14, 8
+	mov		ar.lc = r2
+   (p6)	add		r8 = 1, r8
+	br.ret.sptk.many b0
+.Ldone:	mov		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()