| dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt |
| |
| dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of either: |
| dnl |
| dnl * the GNU Lesser General Public License as published by the Free |
| dnl Software Foundation; either version 3 of the License, or (at your |
| dnl option) any later version. |
| dnl |
| dnl or |
| dnl |
| dnl * the GNU General Public License as published by the Free Software |
| dnl Foundation; either version 2 of the License, or (at your option) any |
| dnl later version. |
| dnl |
| dnl or both in parallel, as here. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| dnl for more details. |
| dnl |
| dnl You should have received copies of the GNU General Public License and the |
| dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| dnl see https://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C POWER3/PPC630 ? |
| C POWER4/PPC970 ? |
| C POWER5 2.25 |
| C POWER6 9.75 |
| C POWER7 2.15 |
| |
| C TODO |
| C * Try to reduce the number of needed live registers |
| C * Micro-optimise header code |
| C * Keep in synch with lshift.asm and lshiftc.asm |
| |
| C INPUT PARAMETERS |
| C rp is the pointer the result limbs are stored through, up is the pointer |
| C the operand limbs are loaded from, n is the limb count (limbs are 8 bytes, |
| C accessed with ld/std) and cnt is the right-shift amount in bits. |
| define(`rp', `r3') |
| define(`up', `r4') |
| define(`n', `r5') |
| define(`cnt', `r6') |
| |
| C WORKING REGISTERS |
| C tnc is set to 64 - cnt by the function and used as the left-shift amount. |
| C u0 and u1 hold operand limbs loaded ahead of use; they live in r30 and r31, |
| C which the function saves on entry and reloads before returning. |
| C retval shares r5 with n: the function writes retval only after its last |
| C read of n. |
| define(`tnc',`r0') |
| define(`u0',`r30') |
| define(`u1',`r31') |
| define(`retval',`r5') |
| |
| C ---------------------------------------------------------------------------- |
| C mpn_rshift -- rp[] = up[] >> cnt, operating on n limbs of 64 bits. |
| C |
| C In: rp = r3, up = r4, n = r5, cnt = r6. |
| C Out: retval = up[0] << (64 - cnt), that is the bits shifted out of the least |
| C significant limb. Returned in r3, or split over r3 (high 32 bits) and |
| C r4 when HAVE_ABI_mode32 is defined. |
| C |
| C r30 and r31 are saved below the stack pointer on entry and reloaded at |
| C L(ret); r1 itself is never modified. rp and up are advanced as the code |
| C runs. Other registers written: r0, r5, r7-r12, ctr, cr0, cr6. |
| C |
| C NOTE(review): the code assumes n >= 1 (up[0] is loaded unconditionally, and |
| C n = 0 would take the b00 path, which reads four limbs). Presumably cnt is |
| C expected to be in 1..63 -- confirm against the mpn_rshift contract. |
| C |
| C Structure: the main loop at L(top) is unrolled four times. Each result limb |
| C is (up[i] >> cnt) | (up[i+1] << tnc), and limbs are loaded ahead of the |
| C stores that consume them. ctr = (n + 3) >> 2, and n mod 4 selects one of |
| C four feed-in blocks (b00, b01, b10, b11). Each feed-in biases rp so that the |
| C first store it reaches, in the loop or in the wind-down code, lands on the |
| C original rp[0]. |
| C ---------------------------------------------------------------------------- |
| ASM_START() |
| PROLOGUE(mpn_rshift) |
| std r31, -8(r1) C save r31 (u1), reloaded at L(ret) |
| std r30, -16(r1) C save r30 (u0), reloaded at L(ret) |
| subfic tnc, cnt, 64 C tnc = 64 - cnt |
| C NOTE(review): the three disabled lines below would advance rp and up by |
| C n limbs; nothing in this routine relies on that -- confirm they are only |
| C kept for comparison with the sibling shift files named in the TODO. |
| C sldi r30, n, 3 C byte count corresponding to n |
| C add rp, rp, r30 C rp = rp + n |
| C add up, up, r30 C up = up + n |
| rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 |
| cmpdi cr6, r30, 2 C cr6 = compare (n & 3) with 2 |
| addi r31, n, 3 C compute count... |
| ld r10, 0(up) C load 1st limb for b00...b11 |
| C Last read of n was the addi above, so r5 can now be reused as retval. |
| sld retval, r10, tnc |
| C r31 = (n + 3) >> 2. The mode32 form rotates instead of shifting and also |
| C clears the upper 34 bits of the result. |
| ifdef(`HAVE_ABI_mode32', |
| ` rldicl r31, r31, 62,34', C ...branch count |
| ` srdi r31, r31, 2') C ...for ctr |
| mtctr r31 C copy count into ctr |
| C Dispatch on n & 3: 0 -> b00, 1 -> b01, 2 -> b10, 3 -> b11 (fall through). |
| beq cr0, L(b00) |
| blt cr6, L(b01) |
| ld r11, 8(up) C load 2nd limb for b10 and b11 |
| beq cr6, L(b10) |
| |
| C n mod 4 = 3. Here r10 = up[0] and r11 = up[1]. rp is biased by -16 because |
| C the first store reached on this path is the one at offset 16. |
| ALIGN(16) |
| L(b11): srd r8, r10, cnt |
| sld r9, r11, tnc |
| ld u1, 16(up) |
| addi up, up, 24 |
| srd r12, r11, cnt |
| sld r7, u1, tnc |
| addi rp, rp, -16 |
| bdnz L(gt3) C more than 3 limbs: enter the loop |
| |
| C n = 3: no more limbs to load, finish in the wind-down code. |
| or r11, r8, r9 |
| srd r8, u1, cnt |
| b L(cj3) |
| |
| C n mod 4 = 3 and n > 3: load two more limbs and join the loop at L(L11). |
| ALIGN(16) |
| L(gt3): ld u0, 0(up) |
| or r11, r8, r9 |
| srd r8, u1, cnt |
| sld r9, u0, tnc |
| ld u1, 8(up) |
| or r10, r12, r7 |
| b L(L11) |
| |
| C n mod 4 = 2. Here r10 = up[0] and r11 = up[1]. rp is biased by -24 because |
| C the first store reached on this path is the one at offset 24. |
| ALIGN(32) |
| L(b10): srd r12, r10, cnt |
| addi rp, rp, -24 |
| sld r7, r11, tnc |
| bdnz L(gt2) C more than 2 limbs: enter the loop |
| |
| C n = 2: finish in the wind-down code. |
| srd r8, r11, cnt |
| or r10, r12, r7 |
| b L(cj2) |
| |
| C n mod 4 = 2 and n > 2: load three more limbs and join the loop at L(L10). |
| L(gt2): ld u0, 16(up) |
| srd r8, r11, cnt |
| sld r9, u0, tnc |
| ld u1, 24(up) |
| or r10, r12, r7 |
| srd r12, u0, cnt |
| sld r7, u1, tnc |
| ld u0, 32(up) |
| or r11, r8, r9 |
| addi up, up, 16 |
| b L(L10) |
| |
| C n mod 4 = 0. Here r10 = up[0]. rp is biased by -8 because the first store |
| C reached on this path is the one at offset 8. |
| ALIGN(16) |
| L(b00): ld u1, 8(up) |
| srd r12, r10, cnt |
| sld r7, u1, tnc |
| ld u0, 16(up) |
| srd r8, u1, cnt |
| sld r9, u0, tnc |
| ld u1, 24(up) |
| or r10, r12, r7 |
| srd r12, u0, cnt |
| sld r7, u1, tnc |
| addi rp, rp, -8 |
| bdz L(cj4) C n = 4: finish in the wind-down code |
| |
| C n mod 4 = 0 and n > 4: load one more limb and join the loop at L(L00). |
| L(gt4): addi up, up, 32 |
| ld u0, 0(up) |
| or r11, r8, r9 |
| b L(L00) |
| |
| C n mod 4 = 1. Here r10 = up[0]. rp is not biased on this path. |
| ALIGN(16) |
| L(b01): bdnz L(gt1) C more than 1 limb: take the general path |
| C n = 1: the single result limb is up[0] >> cnt. |
| srd r8, r10, cnt |
| std r8, 0(rp) |
| b L(ret) |
| |
| C n mod 4 = 1 and n > 1: load up[1]..up[4], then either go straight to the |
| C wind-down code (n = 5) or fall into the top of the loop. |
| L(gt1): ld u0, 8(up) |
| srd r8, r10, cnt |
| sld r9, u0, tnc |
| ld u1, 16(up) |
| srd r12, u0, cnt |
| sld r7, u1, tnc |
| ld u0, 24(up) |
| or r11, r8, r9 |
| srd r8, u1, cnt |
| sld r9, u0, tnc |
| ld u1, 32(up) |
| addi up, up, 40 |
| or r10, r12, r7 |
| bdz L(end) |
| |
| C Main loop: each pass loads four limbs and stores four result limbs. |
| C u0 and u1 alternate as the most recently loaded limbs; r8|r9 is combined |
| C into r11 and r12|r7 into r10, and r11 and r10 alternate as store sources. |
| C L(L00), L(L11) and L(L10) are the entry points used by the feed-in blocks. |
| ALIGN(32) |
| L(top): srd r12, u0, cnt |
| sld r7, u1, tnc |
| ld u0, 0(up) |
| std r11, 0(rp) |
| or r11, r8, r9 |
| L(L00): srd r8, u1, cnt |
| sld r9, u0, tnc |
| ld u1, 8(up) |
| std r10, 8(rp) |
| or r10, r12, r7 |
| L(L11): srd r12, u0, cnt |
| sld r7, u1, tnc |
| ld u0, 16(up) |
| std r11, 16(rp) |
| or r11, r8, r9 |
| L(L10): srd r8, u1, cnt |
| sld r9, u0, tnc |
| ld u1, 24(up) |
| addi up, up, 32 |
| std r10, 24(rp) |
| addi rp, rp, 32 |
| or r10, r12, r7 |
| bdnz L(top) |
| |
| C Wind-down: store the results still held in registers without loading any |
| C further limbs. L(cj4), L(cj3) and L(cj2) are the entry points for n = 4, |
| C n = 3 and n = 2. The last store writes the top limb, which is a plain |
| C shift with nothing ORed in from above. |
| ALIGN(32) |
| L(end): srd r12, u0, cnt |
| sld r7, u1, tnc |
| std r11, 0(rp) |
| L(cj4): or r11, r8, r9 |
| srd r8, u1, cnt |
| std r10, 8(rp) |
| L(cj3): or r10, r12, r7 |
| std r11, 16(rp) |
| L(cj2): std r10, 24(rp) |
| std r8, 32(rp) |
| |
| C Common exit: reload the saved registers and hand back retval. |
| L(ret): ld r31, -8(r1) |
| ld r30, -16(r1) |
| C mode32 splits the 64-bit retval: r3 gets the high 32 bits, r4 a copy of |
| C the whole register. Otherwise retval is simply moved to r3. |
| ifdef(`HAVE_ABI_mode32', |
| ` srdi r3, retval, 32 |
| mr r4, retval |
| ',` mr r3, retval') |
| blr |
| EPILOGUE() |