/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000-2003 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:       ?                ?
*/


/* This implementation reduces the number of multiplies done, knowing that
   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.

   The key idea is to use the fact that the low limb of q*d equals l, this
   being the whole purpose of the q calculated.  It means there's no need to
   calculate the lowest 32x32->64 part of q*d; instead it can be inferred
   from l and the other three 32x32->64 parts.  See sparc64.h for details.

   When d is 32 bits, the same applies, but in this case there's only one
   other 32x32->64 part (ie. HIGH(q)*d).

   The net effect is that for a 64-bit divisor each limb is 4 mulx, or for
   a 32-bit divisor each is 2 mulx.

   Enhancements:

   No doubt this could be done in assembler, if that helped the scheduling,
   or perhaps guaranteed good code irrespective of the compiler.

   Alternatives:

   It might be possible to use floating point.  The loop is dominated by
   multiply latency, so it's not clear whether floats would improve that.
   One possibility would be to take two limbs at a time, with a 128 bit
   inverse, if there are enough registers, which could effectively use float
   throughput to reduce total latency across two limbs.  */

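/* For illustration of the inference described above (a sketch only, not
   taken from sparc64.h): write q = qh*2^32 + ql and d = dh*2^32 + dl, so

       q*d = (qh*dh)*2^64 + (qh*dl + ql*dh)*2^32 + ql*dl

   The low 64 bits of this product are known in advance to equal l, since q
   is formed below as l multiplied by the inverse of d mod 2^64.  So ql*dl
   need not be computed at all: the high limb h of q*d can be reconstructed
   from qh*dh, qh*dl and ql*dh together with l, which is the job expected of
   umul_ppmm_lowequal.  Counting the q = l*inverse multiply as well, that is
   the 4 mulx per limb quoted above, or 2 when dh==0 and only HIGH(q)*d is
   needed.  */
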
#define ASSERT_RETVAL(r)                \
  ASSERT (orig_c < d ? r < d : r <= d)

mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
{
  mp_limb_t  c = orig_c;
  mp_limb_t  s, l, q, h, inverse;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
  if (size == 1)
    {
      s = src[0];
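      /* Both branches below return (c - s) mod d: for s > c the positive
         difference s-c is reduced and then negated mod d, otherwise c-s is
         already non-negative and is reduced directly.  */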
      if (s > c)
        {
          l = s-c;
          h = l % d;
          if (h != 0)
            h = d - h;
        }
      else
        {
          l = c-s;
          h = l % d;
        }
      return h;
    }

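  /* inverse is the multiplicative inverse of d modulo 2^GMP_NUMB_BITS (it
     exists because d is odd), so q = l * inverse in the loops below is the
     unique limb value making the low limb of q*d equal to l.  */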
  binvert_limb (inverse, d);

  if (d <= 0xFFFFFFFF)
    {
      s = *src++;
      size--;
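      /* Each iteration: SUBC_LIMB forms l = s - c, capturing the borrow
         back into c; q = l*inverse makes the low limb of q*d equal l; and
         umul_ppmm_half_lowequal supplies the high limb h of q*d, which
         joins the borrow as the carry c for the next limb.  */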
      do
        {
          SUBC_LIMB (c, l, s, c);
          s = *src++;
          q = l * inverse;
          umul_ppmm_half_lowequal (h, q, d, l);
          c += h;
          size--;
        }
      while (size != 0);

      if (s <= d)
        {
          /* With high s <= d the final step can be a subtract and addback.
             If c==0 then the addback will restore to l>=0.  If c==d then
             will get l==d if s==0, but that's ok per the function
             definition.  */

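          /* l > c here means the subtraction c - s wrapped below zero, so
             add back d to bring the remainder into range.  */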
          l = c - s;
          l += (l > c ? d : 0);

          ASSERT_RETVAL (l);
          return l;
        }
      else
        {
          /* Can't skip a divide, just do the loop code once more. */
          SUBC_LIMB (c, l, s, c);
          q = l * inverse;
          umul_ppmm_half_lowequal (h, q, d, l);
          c += h;

          ASSERT_RETVAL (c);
          return c;
        }
    }
  else
    {
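      /* d occupies more than 32 bits: split it into halves so the loop can
         form the partial products needed by umul_ppmm_lowequal.  */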
      mp_limb_t  dl = LOW32 (d);
      mp_limb_t  dh = HIGH32 (d);
      long  i;

      s = *src++;
      size--;
      do
        {
          SUBC_LIMB (c, l, s, c);
          s = *src++;
          q = l * inverse;
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
          c += h;
          size--;
        }
      while (size != 0);

      if (s <= d)
        {
          /* With high s <= d the final step can be a subtract and addback.
             If c==0 then the addback will restore to l>=0.  If c==d then
             will get l==d if s==0, but that's ok per the function
             definition.  */

          l = c - s;
          l += (l > c ? d : 0);

          ASSERT_RETVAL (l);
          return l;
        }
      else
        {
          /* Can't skip a divide, just do the loop code once more. */
          SUBC_LIMB (c, l, s, c);
          q = l * inverse;
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
          c += h;

          ASSERT_RETVAL (c);
          return c;
        }
    }
}