Austin Schuh | bb1338c | 2024-06-15 19:31:16 -0700 | [diff] [blame] | 1 | dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt |
| 2 | |
| 3 | dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc. |
| 4 | |
| 5 | dnl This file is part of the GNU MP Library. |
| 6 | dnl |
| 7 | dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| 8 | dnl it under the terms of either: |
| 9 | dnl |
| 10 | dnl * the GNU Lesser General Public License as published by the Free |
| 11 | dnl Software Foundation; either version 3 of the License, or (at your |
| 12 | dnl option) any later version. |
| 13 | dnl |
| 14 | dnl or |
| 15 | dnl |
| 16 | dnl * the GNU General Public License as published by the Free Software |
| 17 | dnl Foundation; either version 2 of the License, or (at your option) any |
| 18 | dnl later version. |
| 19 | dnl |
| 20 | dnl or both in parallel, as here. |
| 21 | dnl |
| 22 | dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| 23 | dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 24 | dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 25 | dnl for more details. |
| 26 | dnl |
| 27 | dnl You should have received copies of the GNU General Public License and the |
| 28 | dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| 29 | dnl see https://www.gnu.org/licenses/. |
| 30 | |
| 31 | include(`../config.m4') |
| 32 | |
| 33 | C cycles/limb |
| 34 | C POWER3/PPC630 ? |
| 35 | C POWER4/PPC970 ? |
| 36 | C POWER5 2.25 |
| 37 | C POWER6 9.75 |
| 38 | C POWER7 2.15 |
| 39 | |
| 40 | C TODO |
| 41 | C * Try to reduce the number of needed live registers |
| 42 | C * Micro-optimise header code |
| 43 | C * Keep in synch with lshift.asm and lshiftc.asm |
| 44 | |
| 45 | C INPUT PARAMETERS |
C rp   destination limb pointer (1st C argument, r3)
C up   source limb pointer (2nd C argument, r4)
C n    limb count (3rd C argument, r5)
C cnt  shift count (4th C argument, r6)
| 46 | define(`rp', `r3') |
| 47 | define(`up', `r4') |
| 48 | define(`n', `r5') |
| 49 | define(`cnt', `r6') |
| 50 | |
C Internal registers:
C tnc     64 - cnt, the complementary shift count
C u0, u1  limb buffers, kept in callee-saved r30/r31 (saved in prologue)
C retval  return value (bits shifted out at the low end); deliberately
C         aliases n's register r5 -- n is dead once retval is written
| 51 | define(`tnc',`r0') |
| 52 | define(`u0',`r30') |
| 53 | define(`u1',`r31') |
| 54 | define(`retval',`r5') |
| 55 | |
| 56 | ASM_START() |
C mpn_rshift(rp, up, n, cnt): rp[i] = (up[] >> cnt)[i] for 0 < cnt < 64.
C Returns the bits shifted out at the low end, i.e. up[0] << (64-cnt).
C The operand is walked low-to-high in a 4-way unrolled/pipelined loop;
C the n mod 4 remainder is handled by four entry paths b00/b01/b10/b11
C that either finish short operands directly or join the loop mid-body.
| 57 | PROLOGUE(mpn_rshift) |
C Save callee-saved r30/r31 (the u0/u1 limb buffers) just below the
C stack pointer; no stack frame is allocated.
| 58 | std r31, -8(r1) |
| 59 | std r30, -16(r1) |
C tnc = 64 - cnt, the complementary shift count.
| 60 | subfic tnc, cnt, 64 |
C Commented-out pointer biasing kept from the lshift variant
C (lshift walks the operand top-down; rshift does not need it).
| 61 | C sldi r30, n, 3 C byte count corresponding to n |
| 62 | C add rp, rp, r30 C rp = rp + n |
| 63 | C add up, up, r30 C up = up + n |
C Classify n mod 4: cr0.eq <=> n%4 == 0; cr6 compares n%4 against 2.
| 64 | rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 |
| 65 | cmpdi cr6, r30, 2 |
| 66 | addi r31, n, 3 C compute count... |
| 67 | ld r10, 0(up) C load 1st limb for b00...b11 |
C Return value: low cnt bits of up[0], left-justified.  This also
C frees r5 (n), which retval aliases.
| 68 | sld retval, r10, tnc |
C ctr = (n+3)/4 = ceil(n/4): number of 4-limb loop iterations.
| 69 | ifdef(`HAVE_ABI_mode32', |
| 70 | ` rldicl r31, r31, 62,34', C ...branch count |
| 71 | ` srdi r31, r31, 2') C ...for ctr |
| 72 | mtctr r31 C copy count into ctr |
C Dispatch on n mod 4: 0 -> b00, 1 -> b01, 2 -> b10, else fall to b11.
| 73 | beq cr0, L(b00) |
| 74 | blt cr6, L(b01) |
| 75 | ld r11, 8(up) C load 2nd limb for b10 and b11 |
| 76 | beq cr6, L(b10) |
| 77 | |
C n mod 4 == 3: pre-shift three limbs; for n == 3 finish via the
C wind-down at L(cj3), otherwise join the main loop at L(L11).
| 78 | ALIGN(16) |
| 79 | L(b11): srd r8, r10, cnt |
| 80 | sld r9, r11, tnc |
| 81 | ld u1, 16(up) |
| 82 | addi up, up, 24 |
| 83 | srd r12, r11, cnt |
| 84 | sld r7, u1, tnc |
| 85 | addi rp, rp, -16 |
| 86 | bdnz L(gt3) |
| 87 | |
| 88 | or r11, r8, r9 |
| 89 | srd r8, u1, cnt |
| 90 | b L(cj3) |
| 91 | |
| 92 | ALIGN(16) |
| 93 | L(gt3): ld u0, 0(up) |
| 94 | or r11, r8, r9 |
| 95 | srd r8, u1, cnt |
| 96 | sld r9, u0, tnc |
| 97 | ld u1, 8(up) |
| 98 | or r10, r12, r7 |
| 99 | b L(L11) |
| 100 | |
C n mod 4 == 2: for n == 2 finish via L(cj2), else join at L(L10).
| 101 | ALIGN(32) |
| 102 | L(b10): srd r12, r10, cnt |
| 103 | addi rp, rp, -24 |
| 104 | sld r7, r11, tnc |
| 105 | bdnz L(gt2) |
| 106 | |
| 107 | srd r8, r11, cnt |
| 108 | or r10, r12, r7 |
| 109 | b L(cj2) |
| 110 | |
| 111 | L(gt2): ld u0, 16(up) |
| 112 | srd r8, r11, cnt |
| 113 | sld r9, u0, tnc |
| 114 | ld u1, 24(up) |
| 115 | or r10, r12, r7 |
| 116 | srd r12, u0, cnt |
| 117 | sld r7, u1, tnc |
| 118 | ld u0, 32(up) |
| 119 | or r11, r8, r9 |
| 120 | addi up, up, 16 |
| 121 | b L(L10) |
| 122 | |
C n mod 4 == 0 (so n >= 4): pre-shift four limbs; for n == 4 the ctr
C reaches zero and we finish at L(cj4), else join the loop at L(L00).
| 123 | ALIGN(16) |
| 124 | L(b00): ld u1, 8(up) |
| 125 | srd r12, r10, cnt |
| 126 | sld r7, u1, tnc |
| 127 | ld u0, 16(up) |
| 128 | srd r8, u1, cnt |
| 129 | sld r9, u0, tnc |
| 130 | ld u1, 24(up) |
| 131 | or r10, r12, r7 |
| 132 | srd r12, u0, cnt |
| 133 | sld r7, u1, tnc |
| 134 | addi rp, rp, -8 |
| 135 | bdz L(cj4) |
| 136 | |
| 137 | L(gt4): addi up, up, 32 |
| 138 | ld u0, 0(up) |
| 139 | or r11, r8, r9 |
| 140 | b L(L00) |
| 141 | |
C n mod 4 == 1: n == 1 is handled inline (single store, no loop);
C otherwise pre-shift five limbs and fall into the main loop top.
| 142 | ALIGN(16) |
| 143 | L(b01): bdnz L(gt1) |
| 144 | srd r8, r10, cnt |
| 145 | std r8, 0(rp) |
| 146 | b L(ret) |
| 147 | |
| 148 | L(gt1): ld u0, 8(up) |
| 149 | srd r8, r10, cnt |
| 150 | sld r9, u0, tnc |
| 151 | ld u1, 16(up) |
| 152 | srd r12, u0, cnt |
| 153 | sld r7, u1, tnc |
| 154 | ld u0, 24(up) |
| 155 | or r11, r8, r9 |
| 156 | srd r8, u1, cnt |
| 157 | sld r9, u0, tnc |
| 158 | ld u1, 32(up) |
| 159 | addi up, up, 40 |
| 160 | or r10, r12, r7 |
| 161 | bdz L(end) |
| 162 | |
C Main loop: 4 limbs per iteration, software-pipelined.  Each result
C limb is (up[i] >> cnt) | (up[i+1] << tnc); loads run one limb ahead
C of the or/store so the remainder cases can join at L(L00), L(L11)
C or L(L10) with their pipeline already primed.
| 163 | ALIGN(32) |
| 164 | L(top): srd r12, u0, cnt |
| 165 | sld r7, u1, tnc |
| 166 | ld u0, 0(up) |
| 167 | std r11, 0(rp) |
| 168 | or r11, r8, r9 |
| 169 | L(L00): srd r8, u1, cnt |
| 170 | sld r9, u0, tnc |
| 171 | ld u1, 8(up) |
| 172 | std r10, 8(rp) |
| 173 | or r10, r12, r7 |
| 174 | L(L11): srd r12, u0, cnt |
| 175 | sld r7, u1, tnc |
| 176 | ld u0, 16(up) |
| 177 | std r11, 16(rp) |
| 178 | or r11, r8, r9 |
| 179 | L(L10): srd r8, u1, cnt |
| 180 | sld r9, u0, tnc |
| 181 | ld u1, 24(up) |
| 182 | addi up, up, 32 |
| 183 | std r10, 24(rp) |
| 184 | addi rp, rp, 32 |
| 185 | or r10, r12, r7 |
| 186 | bdnz L(top) |
| 187 | |
C Wind-down: drain the pipeline and store the remaining result limbs;
C the final store (r8 = u1 >> cnt) is the high limb, with no incoming
C bits from above.  The L(cj*) labels are late entry points used by
C the short-operand paths.
| 188 | ALIGN(32) |
| 189 | L(end): srd r12, u0, cnt |
| 190 | sld r7, u1, tnc |
| 191 | std r11, 0(rp) |
| 192 | L(cj4): or r11, r8, r9 |
| 193 | srd r8, u1, cnt |
| 194 | std r10, 8(rp) |
| 195 | L(cj3): or r10, r12, r7 |
| 196 | std r11, 16(rp) |
| 197 | L(cj2): std r10, 24(rp) |
| 198 | std r8, 32(rp) |
| 199 | |
C Restore callee-saved registers and return retval; under the mode32
C ABI the 64-bit value is returned split across r3 (high 32 bits)
C and r4 (low 32 bits).
| 200 | L(ret): ld r31, -8(r1) |
| 201 | ld r30, -16(r1) |
| 202 | ifdef(`HAVE_ABI_mode32', |
| 203 | ` srdi r3, retval, 32 |
| 204 | mr r4, retval |
| 205 | ',` mr r3, retval') |
| 206 | blr |
| 207 | EPILOGUE() |