Austin Schuh | dace2a6 | 2020-08-18 10:56:48 -0700 | [diff] [blame] | 1 | /* mpn_divexact_by3c -- mpn exact division by 3. |
| 2 | |
| 3 | Copyright 2000-2003, 2008 Free Software Foundation, Inc. |
| 4 | |
| 5 | This file is part of the GNU MP Library. |
| 6 | |
| 7 | The GNU MP Library is free software; you can redistribute it and/or modify |
| 8 | it under the terms of either: |
| 9 | |
| 10 | * the GNU Lesser General Public License as published by the Free |
| 11 | Software Foundation; either version 3 of the License, or (at your |
| 12 | option) any later version. |
| 13 | |
| 14 | or |
| 15 | |
| 16 | * the GNU General Public License as published by the Free Software |
| 17 | Foundation; either version 2 of the License, or (at your option) any |
| 18 | later version. |
| 19 | |
| 20 | or both in parallel, as here. |
| 21 | |
| 22 | The GNU MP Library is distributed in the hope that it will be useful, but |
| 23 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 24 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 25 | for more details. |
| 26 | |
| 27 | You should have received copies of the GNU General Public License and the |
| 28 | GNU Lesser General Public License along with the GNU MP Library. If not, |
| 29 | see https://www.gnu.org/licenses/. */ |
| 30 | |
| 31 | #include "gmp-impl.h" |
| 32 | |
| 33 | #if DIVEXACT_BY3_METHOD == 0 |
| 34 | |
| 35 | mp_limb_t |
| 36 | mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c) |
| 37 | { |
| 38 | mp_limb_t r; |
| 39 | r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c); |
| 40 | |
| 41 | /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3. |
| 42 | We want to return C. We compute the remainder mod 4 and notice that the |
| 43 | inverse of (2^(2k)-1)/3 mod 4 is 1. */ |
| 44 | return r & 3; |
| 45 | } |
| 46 | |
| 47 | #endif |
| 48 | |
| 49 | #if DIVEXACT_BY3_METHOD == 1 |
| 50 | |
| 51 | /* The algorithm here is basically the same as mpn_divexact_1, as described |
| 52 | in the manual. Namely at each step q = (src[i]-c)*inverse, and new c = |
| 53 | borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3, |
| 54 | high(divisor*q) can be determined with two comparisons instead of a |
| 55 | multiply. |
| 56 | |
| 57 | The "c += ..."s add the high limb of 3*l to c. That high limb will be 0, |
| 58 | 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of |
| 59 | 2.95.2 at least). |
| 60 | |
| 61 | It will be noted that the new c is formed by adding three values each 0 |
| 62 | or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c |
| 63 | causes a borrow, that leaves a limb value of either 0xFF...FF or |
| 64 | 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or |
| 65 | 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1 |
| 66 | respectively, hence a total of no more than 2. |
| 67 | |
| 68 | Alternatives: |
| 69 | |
| 70 | This implementation has each multiply on the dependent chain, due to |
| 71 | "l=s-c". See below for alternative code which avoids that. */ |
| 72 | |
| 73 | mp_limb_t |
| 74 | mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c) |
| 75 | { |
| 76 | mp_limb_t l, q, s; |
| 77 | mp_size_t i; |
| 78 | |
| 79 | ASSERT (un >= 1); |
| 80 | ASSERT (c == 0 || c == 1 || c == 2); |
| 81 | ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); |
| 82 | |
| 83 | i = 0; |
| 84 | do |
| 85 | { |
| 86 | s = up[i]; |
| 87 | SUBC_LIMB (c, l, s, c); |
| 88 | |
| 89 | q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; |
| 90 | rp[i] = q; |
| 91 | |
| 92 | c += (q >= GMP_NUMB_CEIL_MAX_DIV3); |
| 93 | c += (q >= GMP_NUMB_CEIL_2MAX_DIV3); |
| 94 | } |
| 95 | while (++i < un); |
| 96 | |
| 97 | ASSERT (c == 0 || c == 1 || c == 2); |
| 98 | return c; |
| 99 | } |
| 100 | |
| 101 | |
| 102 | #endif |
| 103 | |
| 104 | #if DIVEXACT_BY3_METHOD == 2 |
| 105 | |
| 106 | /* The following alternative code re-arranges the quotient calculation from |
| 107 | (src[i]-c)*inverse to instead |
| 108 | |
| 109 | q = src[i]*inverse - c*inverse |
| 110 | |
| 111 | thereby allowing src[i]*inverse to be scheduled back as far as desired, |
| 112 | making full use of multiplier throughput and leaving just some carry |
| 113 | handing on the dependent chain. |
| 114 | |
| 115 | The carry handling consists of determining the c for the next iteration. |
| 116 | This is the same as described above, namely look for any borrow from |
| 117 | src[i]-c, and at the high of 3*q. |
| 118 | |
| 119 | high(3*q) is done with two comparisons as above (in c2 and c3). The |
| 120 | borrow from src[i]-c is incorporated into those by noting that if there's |
| 121 | a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn |
| 122 | giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is |
| 123 | enough to make high(3*q) come out 1 bigger, as required. |
| 124 | |
| 125 | l = -c*inverse is calculated at the same time as c, since for most chips |
| 126 | it can be more conveniently derived from separate c1/c2/c3 values than |
| 127 | from a combined c equal to 0, 1 or 2. |
| 128 | |
| 129 | The net effect is that with good pipelining this loop should be able to |
| 130 | run at perhaps 4 cycles/limb, depending on available execute resources |
| 131 | etc. |
| 132 | |
| 133 | Usage: |
| 134 | |
| 135 | This code is not used by default, since we really can't rely on the |
| 136 | compiler generating a good software pipeline, nor on such an approach |
| 137 | even being worthwhile on all CPUs. |
| 138 | |
| 139 | Itanium is one chip where this algorithm helps though, see |
| 140 | mpn/ia64/diveby3.asm. */ |
| 141 | |
| 142 | mp_limb_t |
| 143 | mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) |
| 144 | { |
| 145 | mp_limb_t s, sm, cl, q, qx, c2, c3; |
| 146 | mp_size_t i; |
| 147 | |
| 148 | ASSERT (un >= 1); |
| 149 | ASSERT (cy == 0 || cy == 1 || cy == 2); |
| 150 | ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); |
| 151 | |
| 152 | cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; |
| 153 | |
| 154 | for (i = 0; i < un; i++) |
| 155 | { |
| 156 | s = up[i]; |
| 157 | sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; |
| 158 | |
| 159 | q = (cl + sm) & GMP_NUMB_MASK; |
| 160 | rp[i] = q; |
| 161 | qx = q + (s < cy); |
| 162 | |
| 163 | c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; |
| 164 | c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; |
| 165 | |
| 166 | cy = c2 + c3; |
| 167 | cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); |
| 168 | } |
| 169 | |
| 170 | return cy; |
| 171 | } |
| 172 | |
| 173 | #endif |