dnl  ARM64 mpn_lshift.

dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb     assumed optimal c/l
C Cortex-A53        3.5-4.0              3.25
C Cortex-A57          2.0                2.0
C X-Gene              2.67               2.5

C TODO
C  * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes.  These
C    numbers should be 1 and 0, respectively.  The str in wind-down should also
C    go.
C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.

C Replace the m4 comment delimiter with a token that never occurs in this file.
C NOTE(review): presumably so that the immediates written with a hash sign do
C not start an m4 comment and suppress macro expansion on the rest of the line.
changecom(blah)

C Incoming arguments, in AAPCS64 argument-register order.
define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

C Working registers: rp is the result pointer (x0 itself is overwritten with
C the return value early on), tnc holds the negated shift count.
define(`rp',     `x16')
define(`tnc',    `x8')

C PSHIFT shifts in the primary (left) direction, NSHIFT in the opposite one.
define(`PSHIFT', `lsl')
define(`NSHIFT', `lsr')

ASM_START()
C mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift the n-limb operand at up left by cnt bits, store the n result limbs at
C rp, and return the bits shifted out of the most significant limb.
C
C ABI:   AAPCS64, leaf function, no stack usage.
C In:    x0 = rp_arg, x1 = up, x2 = n, x3 = cnt
C Out:   x0 = most significant source limb shifted right by 64-cnt
C Clobb: x1, x2, x4-x8, x10-x13, x16, x17
C
C Assumes n >= 1 and 1 <= cnt <= 63; there is no handling for n = 0, and
C cnt = 0 would make the complementary shift a shift by 0.
C
C Limbs are processed from the most significant end downwards: both pointers
C are first advanced one past the end and then walked back with negative
C offsets.  The code dispatches on n mod 4 into a loop that handles four limbs
C per iteration.  Inside the pipeline x2 and x13 carry left-shifted (PSHIFT)
C parts, x10 and x12 carry right-shifted (NSHIFT) parts, and each result limb
C is the orr of one part of each kind.
C
C The loop counter is kept in x17 and x18 is deliberately left untouched:
C AAPCS64 designates x18 as the platform register, and it is reserved on
C Darwin and Windows.  x17 (IP1) is an ordinary caller-saved scratch register
C and nothing in this leaf routine can clobber it.

PROLOGUE(mpn_lshift)
        add     rp, rp_arg, n, lsl #3   C rp = one past the last result limb
        add     up, up, n, lsl #3       C up = one past the last source limb
        sub     tnc, xzr, cnt           C tnc = -cnt, i.e. 64-cnt modulo 64
        lsr     x17, n, #2              C x17 = n / 4, number of loop passes
        tbz     n, #0, L(bx0)           C even n

C Odd n: fetch the top limb alone.
L(bx1): ldr     x4, [up,#-8]
        tbnz    n, #1, L(b11)

C n = 1 mod 4.
L(b01): NSHIFT  x0, x4, tnc             C return value: bits shifted out
        PSHIFT  x2, x4, cnt
        cbnz    x17, L(gt1)
        str     x2, [rp,#-8]            C n = 1: single result limb
        ret
L(gt1): ldp     x4, x5, [up,#-24]
        sub     up, up, #8              C bias pointers for entry at lo2
        add     rp, rp, #16
        b       L(lo2)

C n = 3 mod 4.
L(b11): NSHIFT  x0, x4, tnc             C return value: bits shifted out
        PSHIFT  x2, x4, cnt
        ldp     x6, x7, [up,#-24]!
        b       L(lo3)

C Even n: fetch the top two limbs.
L(bx0): ldp     x4, x5, [up,#-16]
        tbz     n, #1, L(b00)

C n = 2 mod 4.
L(b10): NSHIFT  x0, x5, tnc             C return value: bits shifted out
        PSHIFT  x13, x5, cnt
        NSHIFT  x10, x4, tnc
        PSHIFT  x2, x4, cnt
        cbnz    x17, L(gt2)
        orr     x10, x10, x13
        stp     x2, x10, [rp,#-16]      C n = 2: both result limbs
        ret
L(gt2): ldp     x4, x5, [up,#-32]
        orr     x10, x10, x13
        str     x10, [rp,#-8]           C top result limb
        sub     up, up, #16             C bias pointers for entry at lo2
        add     rp, rp, #8
        b       L(lo2)

C n = 0 mod 4.
L(b00): NSHIFT  x0, x5, tnc             C return value: bits shifted out
        PSHIFT  x13, x5, cnt
        NSHIFT  x10, x4, tnc
        PSHIFT  x2, x4, cnt
        ldp     x6, x7, [up,#-32]!
        orr     x10, x10, x13
        str     x10, [rp,#-8]!          C top result limb
        b       L(lo0)

C Main loop: four source limbs in, four result limbs out per pass.
        ALIGN(16)
L(top): ldp     x4, x5, [up,#-16]
        orr     x10, x10, x13
        orr     x11, x12, x2
        stp     x10, x11, [rp,#-16]
        PSHIFT  x2, x6, cnt
L(lo2): NSHIFT  x10, x4, tnc
        PSHIFT  x13, x5, cnt
        NSHIFT  x12, x5, tnc
        ldp     x6, x7, [up,#-32]!
        orr     x10, x10, x13
        orr     x11, x12, x2
        stp     x10, x11, [rp,#-32]!
        PSHIFT  x2, x4, cnt
L(lo0): sub     x17, x17, #1
L(lo3): NSHIFT  x10, x6, tnc
        PSHIFT  x13, x7, cnt
        NSHIFT  x12, x7, tnc
        cbnz    x17, L(top)

C Wind-down: store the last three result limbs; the lowest one has no
C right-shifted part to merge.
L(end): orr     x10, x10, x13
        orr     x11, x12, x2
        PSHIFT  x2, x6, cnt
        stp     x10, x11, [rp,#-16]
        str     x2, [rp,#-24]
        ret
EPILOGUE()