dnl  IA-64 mpn_popcount -- mpn population count.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:       1.5
C Itanium 2:     1

C INPUT PARAMETERS
C   up  address of the first limb
C   n   number of limbs
define(`up', `r32')
define(`n', `r33')

C WORKING REGISTERS
C   u0-u3  limbs most recently loaded from memory, one per loop stage
C   c0-c3  popcnt results waiting to be added into the running total
C   s      running total of set bits; r8 also carries the return value
define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
define(`s',`r8')


C -----------------------------------------------------------------------------
C mpn_popcount -- count the set bits in the n limbs starting at up.
C
C GMP manual prototype:
C   mp_bitcnt_t mpn_popcount (const mp_limb_t *s1p, mp_size_t n)
C
C In:    up = r32   address of the first limb; limbs are read with ld8
C        n  = r33   number of limbs
C Out:   s  = r8    sum of popcnt over the limbs
C Uses:  r2 saved ar.lc, r9 prefetch pointer, r10 first limb, r14 n mod 4,
C        u0-u3, c0-c3, p6-p8, p14, p15.  ar.lc is used as the loop counter and
C        is restored from r2 on every exit path that wrote it.
C
C Structure: the first limb is always loaded into r10, then the code
C dispatches on n mod 4 to one of four feed-in blocks, .Lb00 .Lb01 .Lb10 and
C .Lb11.  Each feed-in block either finishes a short operand directly in the
C wind-down code at .Lcj4 .Lcj3 .Lcj2, or primes the software pipeline and
C enters the 4-way unrolled loop at the matching stage.  In the loop every
C instruction group loads one limb, runs popcnt on a limb loaded earlier and
C adds an earlier popcnt result into s.
C
C The first limb is loaded and counted unconditionally, so n must be at
C least 1.  With n = 0 the .Lb00 path would load and count four limbs.
C
C NOTE(review): only .prologue is emitted; there is no .save for the ar.lc
C copy in r2 and no .body.  Confirm whether unwind info is needed here.
C -----------------------------------------------------------------------------
ASM_START()
PROLOGUE(mpn_popcount)
        .prologue
C Under the 32-bit ABI the arguments arrive as 32-bit values: addp4 forms a
C 64-bit address from up and zxt4 zero-extends n.
ifdef(`HAVE_ABI_32',
`       addp4           up = 0, up              C                       M I
        nop.m           0
        zxt4            n = n                   C                       I
        ;;
')

C All three readers of n in the second bundle see the incoming value; the
C biased n - 5 is only consumed later as ar.lc = n >> 2.
{.mmi;  add             r9 = 512, up            C prefetch pointer      M I
        ld8             r10 = [up], 8           C load first limb       M01
        mov.i           r2 = ar.lc              C save ar.lc            I0
}{.mmi; and             r14 = 3, n              C r14 = n mod 4         M I
        cmp.lt          p15, p14 = 4, n         C p15: n > 4  p14: n <= 4  M I
        add             n = -5, n               C bias for loop count   M I
        ;;
}{.mmi; cmp.eq          p6, p0 = 1, r14         C n mod 4 = 1           M I
        cmp.eq          p7, p0 = 2, r14         C n mod 4 = 2           M I
        cmp.eq          p8, p0 = 3, r14         C n mod 4 = 3           M I
}{.bbb
  (p6)  br.dptk         .Lb01                   C                       B
  (p7)  br.dptk         .Lb10                   C                       B
  (p8)  br.dptk         .Lb11                   C                       B
}


C n mod 4 = 0.  Load limbs 2-4.  For n = 4 sum the four counts through .Lcj4,
C otherwise load a fifth limb and enter the loop at .LL00.
.Lb00:  ld8             u1 = [up], 8            C                       M01
        shr.u           n = n, 2                C loop count            I0
        mov             s = 0                   C                       M I
        ;;
        ld8             u2 = [up], 8            C                       M01
        popcnt          c0 = r10                C                       I0
        mov.i           ar.lc = n               C                       I0
        ;;
        ld8             u3 = [up], 8            C                       M01
        popcnt          c1 = u1                 C                       I0
  (p15) br.cond.dptk    .grt4                   C                       B
        ;;
        nop.m           0                       C                       -
        nop.m           0                       C                       -
        popcnt          c2 = u2                 C                       I0
        ;;
        mov             s = c0                  C                       M I
        popcnt          c3 = u3                 C                       I0
        br              .Lcj4                   C                       B

.grt4:  ld8             u0 = [up], 8            C                       M01
        popcnt          c2 = u2                 C                       I0
        br              .LL00                   C                       B


C n mod 4 = 1.  For n = 1 the count of the first limb is the result and the
C routine returns at once; ar.lc has not been written on that path.
C Otherwise load four more limbs and enter the loop at its top, with c3 = 0
C so that the first add of c3 is harmless.
.Lb01:
        popcnt          s = r10                 C                       I0
  (p14) br.ret.sptk.many b0                     C                       B

.grt1:  ld8             u0 = [up], 8            C                       M01
        shr.u           n = n, 2                C loop count            I0
        ;;
        ld8             u1 = [up], 8            C                       M01
        mov.i           ar.lc = n               C                       I0
        ;;
        ld8             u2 = [up], 8            C                       M01
        popcnt          c0 = u0                 C                       I0
        mov             c3 = 0                  C                       I0

        ;;
        ld8             u3 = [up], 8            C                       M01
        popcnt          c1 = u1                 C                       I0
        br.cloop.dptk   .Loop                   C                       B
        br              .Lend                   C                       B


C n mod 4 = 2.  For n = 2 add the two counts through .Lcj2.  Otherwise the
C count of the first limb goes to c2, the count of the second to c3, and the
C loop is entered at .LL10.
.Lb10:  ld8             u3 = [up], 8            C                       M01
        shr.u           n = n, 2                C loop count            I0
  (p15) br.cond.dptk    .grt2                   C                       B

        popcnt          s = r10                 C                       I0
        ;;
        popcnt          c3 = u3                 C                       I0
        br              .Lcj2                   C                       B

.grt2:  ld8             u0 = [up], 8            C                       M01
        mov.i           ar.lc = n               C                       I0
        popcnt          c2 = r10                C                       I0
        ;;
        ld8             u1 = [up], 8            C                       M01
        popcnt          c3 = u3                 C                       I0
        mov             s = 0                   C                       M I
        ;;
        ld8             u2 = [up], 8            C                       M01
        popcnt          c0 = u0                 C                       I0
        br              .LL10                   C                       B


C n mod 4 = 3.  For n = 3 add the three counts through .Lcj3.  Otherwise
C enter the loop at .LL11 with c1 = 0 so that the first add of c1 is harmless.
C NOTE(review): the s = 0 written in the first group is overwritten by the
C popcnt into s in the next group before anything reads it; it looks like a
C slot filler only.
.Lb11:  ld8             u2 = [up], 8            C                       M01
        shr.u           n = n, 2                C loop count            I0
        mov             s = 0                   C                       M I
        ;;
        ld8             u3 = [up], 8            C                       M01
        popcnt          s = r10                 C                       I0
  (p15) br.cond.dptk    .grt3                   C                       B

        popcnt          c2 = u2                 C                       I0
        ;;
        popcnt          c3 = u3                 C                       I0
        br              .Lcj3                   C                       B

.grt3:  ld8             u0 = [up], 8            C                       M01
        popcnt          c2 = u2                 C                       I0
        mov.i           ar.lc = n               C                       I0
        mov             c1 = 0                  C                       M I
        ;;
        ld8             u1 = [up], 8            C                       M01
        popcnt          c3 = u3                 C                       I0
        br              .LL11                   C                       B


C Main loop, four limbs per iteration.  Each group loads one limb, counts the
C limb loaded two groups earlier and accumulates the count produced two
C groups earlier.  lfetch advances the prefetch pointer by 32 bytes, which is
C the amount of data consumed per iteration.
.Loop:  ld8             u0 = [up], 8            C                       M01
        popcnt          c2 = u2                 C                       I0
        add             s = s, c3               C                       M I
        ;;
.LL00:  ld8             u1 = [up], 8            C                       M01
        popcnt          c3 = u3                 C                       I0
        add             s = s, c0               C                       M I
        ;;
.LL11:  ld8             u2 = [up], 8            C                       M01
        popcnt          c0 = u0                 C                       I0
        add             s = s, c1               C                       M I
        ;;
.LL10:  ld8             u3 = [up], 8            C                       M01
        popcnt          c1 = u1                 C                       I0
        add             s = s, c2               C                       M I
        lfetch          [r9], 32                C                       M01
        nop.m           0                       C                       -
        br.cloop.dptk   .Loop                   C                       B
        ;;

C Wind-down: count the last two loaded limbs and drain c0-c3 into s.  The
C short-operand paths join at .Lcj4 .Lcj3 .Lcj2 according to how many counts
C they still have to add.
.Lend:  popcnt          c2 = u2                 C                       I0
        add             s = s, c3               C                       M I
        ;;
        popcnt          c3 = u3                 C                       I0
        add             s = s, c0               C                       M I
        ;;
.Lcj4:  add             s = s, c1               C                       M I
        ;;
.Lcj3:  add             s = s, c2               C                       M I
        ;;
.Lcj2:  add             s = s, c3               C                       M I
        mov.i           ar.lc = r2              C restore ar.lc         I0
        br.ret.sptk.many b0                     C                       B
EPILOGUE()
ASM_END()