| dnl ARM64 mpn_rshift. |
| |
| dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of the GNU Lesser General Public License as published |
| dnl by the Free Software Foundation; either version 3 of the License, or (at |
| dnl your option) any later version. |
| |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| dnl License for more details. |
| |
| dnl You should have received a copy of the GNU Lesser General Public License |
| dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb assumed optimal c/l |
| C Cortex-A53 3.5-4.0 3.25 |
| C Cortex-A57 2.0 2.0 |
| C X-Gene 2.67 2.5 |
| |
| C TODO |
| C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These |
| C numbers should be 1 and 0, respectively. The str in wind-down should also |
| C go. |
| C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. |
| C * A53's speed depends on alignment, but not as simply as for lshift/lshiftc. |
| |
| C Replace the default m4 comment start string with the word blah, so that a |
| C hash sign in the assembly below does not begin an m4 comment. |
| changecom(blah) |
| |
| C Names for x0 to x3 as they are used on entry to the function below. |
| C NOTE(review): presumably the AAPCS64 argument registers for the four |
| C arguments rp, up, n and cnt - confirm against the C prototype. |
| define(`rp_arg', `x0') |
| define(`up', `x1') |
| define(`n', `x2') |
| define(`cnt', `x3') |
| |
| C Working copy of the destination pointer.  The function body moves rp_arg |
| C here first and afterwards writes a different value into x0. |
| define(`rp', `x16') |
| |
| C Receives 0 - cnt in the function body and is used as the count of every |
| C NSHIFT. |
| define(`tnc',`x8') |
| |
| C PSHIFT is the logical right shift applied with cnt, NSHIFT the logical left |
| C shift applied with tnc. |
| define(`PSHIFT', lsr) |
| define(`NSHIFT', lsl) |
| |
| ASM_START() |
| C mpn_rshift (rp, up, n, cnt) |
| C |
| C Shift the n limbs at up right by cnt bits and store the n result limbs at |
| C rp, working from the least significant limb upwards.  Result limb i is |
| C (up[i+1] NSHIFT tnc) OR (up[i] PSHIFT cnt); the top limb is just the PSHIFT |
| C part.  The value left in x0 is the lowest source limb NSHIFTed by tnc, that |
| C is the bits that fall off the low end. |
| C |
| C NOTE(review): the code assumes n >= 1 and cnt in the range 1 to 63; neither |
| C is checked here - confirm against the callers. |
| C |
| C Register use: |
| C x0 return value, written once on every path |
| C x2 PSHIFT part carried into the next result limb; n is dead by then |
| C x4-x7 source limbs |
| C x10-x13 shifted parts and the combined result limbs |
| C x17 number of groups of four limbs still to be processed |
| C |
| C x17 holds the loop counter rather than x18: x18 is the AAPCS64 platform |
| C register and is reserved on some platforms, so it must not be modified. |
| PROLOGUE(mpn_rshift) |
| mov rp, rp_arg C free x0 for the return value |
| sub tnc, xzr, cnt C tnc = -cnt; register shifts use it modulo 64 |
| lsr x17, n, #2 C x17 = n / 4 |
| tbz n, #0, L(bx0) C bit 0 clear: n is even |
| |
| L(bx1): ldr x5, [up] |
| tbnz n, #1, L(b11) C bit 1 set: n mod 4 = 3 |
| |
| C n mod 4 = 1 |
| L(b01): NSHIFT x0, x5, tnc |
| PSHIFT x2, x5, cnt |
| cbnz x17, L(gt1) |
| str x2, [rp] C n = 1: single result limb |
| ret |
| L(gt1): ldp x4, x5, [up,#8] |
| sub up, up, #8 C bias pointers for the pre-indexed accesses at lo2 |
| sub rp, rp, #32 |
| b L(lo2) |
| |
| C n mod 4 = 3 |
| L(b11): NSHIFT x0, x5, tnc |
| PSHIFT x2, x5, cnt |
| ldp x6, x7, [up,#8]! |
| sub rp, rp, #16 C bias rp for the stores at offset 16 and 32 |
| b L(lo3) C enter after the counter decrement |
| |
| L(bx0): ldp x4, x5, [up] |
| tbz n, #1, L(b00) C bit 1 clear: n mod 4 = 0 |
| |
| C n mod 4 = 2 |
| L(b10): NSHIFT x0, x4, tnc |
| PSHIFT x13, x4, cnt |
| NSHIFT x10, x5, tnc |
| PSHIFT x2, x5, cnt |
| cbnz x17, L(gt2) |
| orr x10, x10, x13 |
| stp x10, x2, [rp] C n = 2: both result limbs at once |
| ret |
| L(gt2): ldp x4, x5, [up,#16] |
| orr x10, x10, x13 |
| str x10, [rp],#-24 C store limb 0, then bias rp for lo2 |
| b L(lo2) |
| |
| C n mod 4 = 0 |
| L(b00): NSHIFT x0, x4, tnc |
| PSHIFT x13, x4, cnt |
| NSHIFT x10, x5, tnc |
| PSHIFT x2, x5, cnt |
| ldp x6, x7, [up,#16]! |
| orr x10, x10, x13 |
| str x10, [rp],#-8 C store limb 0, then bias rp for the loop stores |
| b L(lo0) |
| |
| C Main loop, four limbs per iteration.  On every entry x2 holds the PSHIFT |
| C part of the last source limb already consumed. |
| ALIGN(16) |
| L(top): ldp x4, x5, [up,#16] |
| orr x10, x10, x13 |
| orr x11, x12, x2 |
| stp x11, x10, [rp,#16] |
| PSHIFT x2, x7, cnt |
| L(lo2): NSHIFT x10, x5, tnc |
| NSHIFT x12, x4, tnc |
| PSHIFT x13, x4, cnt |
| ldp x6, x7, [up,#32]! C advance up by four limbs |
| orr x10, x10, x13 |
| orr x11, x12, x2 |
| stp x11, x10, [rp,#32]! C advance rp by four limbs |
| PSHIFT x2, x5, cnt |
| L(lo0): sub x17, x17, #1 |
| L(lo3): NSHIFT x10, x7, tnc |
| NSHIFT x12, x6, tnc |
| PSHIFT x13, x6, cnt |
| cbnz x17, L(top) |
| |
| C Wind-down: combine the last pair and store the top limb, which has no |
| C higher neighbour and so is only the PSHIFT part. |
| L(end): orr x10, x10, x13 |
| orr x11, x12, x2 |
| PSHIFT x2, x7, cnt |
| stp x11, x10, [rp,#16] |
| str x2, [rp,#32] |
| ret |
| EPILOGUE() |