Austin Schuh | bb1338c | 2024-06-15 19:31:16 -0700 | [diff] [blame] | 1 | dnl ARM64 mpn_rshift. |
| 2 | |
| 3 | dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. |
| 4 | |
| 5 | dnl This file is part of the GNU MP Library. |
| 6 | |
| 7 | dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| 8 | dnl it under the terms of the GNU Lesser General Public License as published |
| 9 | dnl by the Free Software Foundation; either version 3 of the License, or (at |
| 10 | dnl your option) any later version. |
| 11 | |
| 12 | dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| 13 | dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| 14 | dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| 15 | dnl License for more details. |
| 16 | |
| 17 | dnl You should have received a copy of the GNU Lesser General Public License |
| 18 | dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| 19 | |
| 20 | include(`../config.m4') |
| 21 | |
| 22 | C cycles/limb assumed optimal c/l |
| 23 | C Cortex-A53 3.5-4.0 3.25 |
| 24 | C Cortex-A57 2.0 2.0 |
| 25 | C X-Gene 2.67 2.5 |
| 26 | |
| 27 | C TODO |
| 28 | C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These |
| 29 | C numbers should be 1 and 0, respectively. The str in wind-down should also |
| 30 | C go. |
| 31 | C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. |
| 32 | C * A53's speed depends on alignment, but not as simply as for lshift/lshiftc. |
| 33 | |
C Move the m4 comment delimiter away from the hash sign, which the
C instructions in this file use to introduce immediate operands.
changecom(blah)

C Incoming arguments, in argument-register order.
define(`rp_arg',	`x0')
define(`up',		`x1')
define(`n',		`x2')
define(`cnt',		`x3')

C Working registers.  rp is the copy of rp_arg that the routine actually
C stores through, since x0 itself is overwritten with the return value.
C tnc is set to the negated bit count.
define(`rp',		`x16')
define(`tnc',		`x8')

C Shift mnemonics.  Throughout the routine PSHIFT is applied with cnt and
C NSHIFT with tnc.
define(`PSHIFT',	lsr)
define(`NSHIFT',	lsl)

ASM_START()
C mpn_rshift -- shift the n-limb operand at up right by cnt bits and store
C the n result limbs at rp.
C
C In:	x0 (rp_arg)	destination pointer
C	x1 (up)		source pointer
C	x2 (n)		number of 64-bit limbs
C	x3 (cnt)	bit count
C Out:	x0		up[0] << tnc, that is, the bits which fall off the low
C			end of the operand, left-justified in the limb
C
C Result limb i is (up[i] >> cnt) | (up[i+1] << tnc), and the top limb is
C just up[n-1] >> cnt.  tnc is -cnt; register-controlled shifts use the count
C modulo 64, so a shift by tnc acts as a shift by 64 - cnt.
C
C NOTE(review): there is no guard for n = 0 (the even path loads two limbs
C unconditionally), and the shift by tnc is only the complement of the shift
C by cnt when cnt is in 1..63.  Presumably callers guarantee both; confirm
C against the mpn interface documentation.
C
C Structure: the main loop produces four result limbs per pass.  The entry
C code dispatches on n mod 4, produces the return value, and jumps into the
C loop at the matching point with up and rp biased so that the fixed offsets
C inside the loop line up.
C
C Register use inside the routine:
C	x4-x7		source limbs
C	x10-x13		shifted halves and combined result limbs
C	x2		previous limb >> cnt, waiting for the next limb << tnc
C	x17		remaining main loop passes (starts at n / 4)
C
C The pass counter is kept in x17, not x18.  x18 is the platform register in
C the AArch64 procedure call standard and is reserved on some platforms, so
C it must not be used as a scratch register.  All branches here are to local
C labels, so nothing can clobber x16 or x17 behind our back.

PROLOGUE(mpn_rshift)
	mov	rp, rp_arg		C x0 is reused for the return value
	sub	tnc, xzr, cnt		C tnc = -cnt
	lsr	x17, n, #2		C x17 = n / 4, main loop pass count
	tbz	n, #0, L(bx0)		C bit 0 clear: n is even

C n is odd: start from the single limb up[0].
L(bx1):	ldr	x5, [up]
	tbnz	n, #1, L(b11)

C n mod 4 = 1
L(b01):	NSHIFT	x0, x5, tnc		C return value
	PSHIFT	x2, x5, cnt		C n is dead from here; x2 = up[0] >> cnt
	cbnz	x17, L(gt1)
	str	x2, [rp]		C n = 1: that is the whole result
	ret
L(gt1):	ldp	x4, x5, [up,#8]		C up[1], up[2]
	sub	up, up, #8		C bias both pointers for the
	sub	rp, rp, #32		C pre-indexed accesses at lo2
	b	L(lo2)

C n mod 4 = 3
L(b11):	NSHIFT	x0, x5, tnc		C return value
	PSHIFT	x2, x5, cnt		C x2 = up[0] >> cnt
	ldp	x6, x7, [up,#8]!	C up[1], up[2]; up advances one limb
	sub	rp, rp, #16		C bias rp for the stores after lo3
	b	L(lo3)

C n is even: start from the pair up[0], up[1].
L(bx0):	ldp	x4, x5, [up]
	tbz	n, #1, L(b00)

C n mod 4 = 2
L(b10):	NSHIFT	x0, x4, tnc		C return value
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt		C x2 = up[1] >> cnt
	cbnz	x17, L(gt2)
	orr	x10, x10, x13		C n = 2: result limb 0
	stp	x10, x2, [rp]		C store it with the top limb
	ret
L(gt2):	ldp	x4, x5, [up,#16]	C up[2], up[3]
	orr	x10, x10, x13		C result limb 0
	str	x10, [rp],#-24		C store it, then bias rp for lo2
	b	L(lo2)

C n mod 4 = 0
L(b00):	NSHIFT	x0, x4, tnc		C return value
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt		C x2 = up[1] >> cnt
	ldp	x6, x7, [up,#16]!	C up[2], up[3]; up advances two limbs
	orr	x10, x10, x13		C result limb 0
	str	x10, [rp],#-8		C store it, then bias rp for lo0
	b	L(lo0)

C Main loop.  Each pass loads four source limbs (x4,x5 then x6,x7) and
C stores four result limbs.  The second load and second store use writeback
C to advance up and rp by 32 bytes per pass.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13		C finish the pair started at lo3
	orr	x11, x12, x2
	stp	x11, x10, [rp,#16]
	PSHIFT	x2, x7, cnt		C carry the low part of x7 forward
L(lo2):	NSHIFT	x10, x5, tnc
	NSHIFT	x12, x4, tnc
	PSHIFT	x13, x4, cnt
	ldp	x6, x7, [up,#32]!
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#32]!
	PSHIFT	x2, x5, cnt		C carry the low part of x5 forward
L(lo0):	sub	x17, x17, #1		C one pass fewer to go
L(lo3):	NSHIFT	x10, x7, tnc		C entry for n mod 4 = 3 skips the decrement
	NSHIFT	x12, x6, tnc
	PSHIFT	x13, x6, cnt
	cbnz	x17, L(top)

C Wind-down: combine the final pair, then store the top limb, which has no
C higher neighbour to take bits from.
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x7, cnt		C top result limb
	stp	x11, x10, [rp,#16]
	str	x2, [rp,#32]
	ret
EPILOGUE()