blob: 7cf6a83428c59b2e4dc2f673845909be6c2f1886 [file] [log] [blame]
1dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
2
3dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
4
5dnl This file is part of the GNU MP Library.
6dnl
7dnl The GNU MP Library is free software; you can redistribute it and/or modify
8dnl it under the terms of either:
9dnl
10dnl * the GNU Lesser General Public License as published by the Free
11dnl Software Foundation; either version 3 of the License, or (at your
12dnl option) any later version.
13dnl
14dnl or
15dnl
16dnl * the GNU General Public License as published by the Free Software
17dnl Foundation; either version 2 of the License, or (at your option) any
18dnl later version.
19dnl
20dnl or both in parallel, as here.
21dnl
22dnl The GNU MP Library is distributed in the hope that it will be useful, but
23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25dnl for more details.
26dnl
27dnl You should have received copies of the GNU General Public License and the
28dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29dnl see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C cycles/limb
34C POWER3/PPC630 ?
35C POWER4/PPC970 ?
36C POWER5 2.25
37C POWER6 9.5
38C POWER7 2.15
39
40C TODO
41C * Try to reduce the number of needed live registers
42C * Micro-optimise header code
43C * Keep in synch with lshift.asm and rshift.asm
44C * Could the long-scheduled std insns be less scheduled?
45
46C INPUT PARAMETERS
C rp   (r3): pointer to the result limbs; the code only stores through it.
C up   (r4): pointer to the source limbs; the code only loads through it.
C n    (r5): number of 64-bit limbs (scaled by 8 to form a byte count).
C cnt  (r6): left shift count in bits.
47define(`rp', `r3')
48define(`up', `r4')
49define(`n', `r5')
50define(`cnt', `r6')
51
C WORKING REGISTERS
C tnc    (r0):  64 - cnt, the complementary right shift count.
C u0, u1 (r30, r31): source limbs loaded ahead of use.  The function saves
C               and restores both registers around its body.
C retval (r5):  return value.  This is the same register as n; the function
C               reads n for the last time before it first writes retval.
52define(`tnc',`r0')
53define(`u0',`r30')
54define(`u1',`r31')
55define(`retval',`r5')
56
57ASM_START()
C mpn_lshiftc (rp, up, n, cnt)
C
C Shift the n limbs at up left by cnt bits, complement every result bit, and
C store the n result limbs at rp.  Limbs are 64 bits wide.  The return value
C is the top limb shifted right by 64-cnt, i.e. the bits shifted out at the
C high end; it is NOT complemented (no nor is applied to retval).
C
C Each result limb is  nor (up[i] << cnt, up[i-1] >> tnc);  the lowest limb
C is  nor (up[0] << cnt, up[0] << cnt), i.e. the complement of the shifted
C limb with the vacated low bits set to one.
C
C Operands are walked from the most significant limb downward: both pointers
C are first advanced past the end and all accesses use negative offsets.
C
C Register usage inside the body:
C   r10, r11    finished result limbs waiting to be stored (also the first
C               two source limbs during feed-in)
C   r8,  r12    source limb shifted left by cnt
C   r9,  r7     next lower source limb shifted right by tnc
C   ctr         (n + 3) >> 2, counted down by bdnz/bdz in the feed-in code
C               and at the bottom of the main loop
C
C r30 and r31 are saved below r1 without moving r1.
C NOTE(review): that relies on the ABI allowing stores below the stack
C pointer in a leaf function -- confirm for every targeted ABI.
C NOTE(review): presumably n >= 1 and 1 <= cnt <= 63 (the first limb is
C loaded unconditionally and tnc = 64 - cnt is used as a shift count) --
C confirm against the caller contract.
C NOTE(review): under HAVE_ABI_mode32 only the return value is adapted; n is
C used as a full 64-bit register by sldi/addi/srdi.  Verify whether n needs
C zero-extension in that ABI (compare lshift.asm, see TODO list).
58PROLOGUE(mpn_lshiftc)
59 std r31, -8(r1)
60 std r30, -16(r1)
61 subfic tnc, cnt, 64
62 sldi r7, n, 3 C byte count corresponding to n
63 add up, up, r7 C up = up + n
64 add rp, rp, r7 C rp = rp + n
65 rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
66 cmpdi cr6, r30, 2
67 addi r31, n, 3 C compute count...
68 ld r10, -8(up) C load 1st limb for b00...b11
C retval overwrites r5; n is not read again after the addi above.
69 srd retval, r10, tnc
70 srdi r31, r31, 2 C ...for ctr
71 mtctr r31 C copy count into ctr
C Dispatch on n mod 4: 0 -> b00, 1 -> b01, 2 -> b10, 3 -> b11 (fall through).
72 beq cr0, L(b00)
73 blt cr6, L(b01)
74 ld r11, -16(up) C load 2nd limb for b10 and b11
75 beq cr6, L(b10)
76
C Feed-in for n mod 4 = 3.  With exactly 3 limbs the bdnz falls through and
C the three results are stored by the shared tail at cj3.
77 ALIGN(16)
78L(b11): sld r8, r10, cnt
79 srd r9, r11, tnc
80 ld u1, -24(up)
81 addi up, up, -24
82 sld r12, r11, cnt
83 srd r7, u1, tnc
84 addi rp, rp, 16
85 bdnz L(gt3)
86
87 nor r11, r8, r9
88 sld r8, u1, cnt
C Lowest limb: complement of the left-shifted limb alone.
89 nor r8, r8, r8
90 b L(cj3)
91
C More than 3 limbs: prime the pipeline and enter the main loop at L11.
92 ALIGN(16)
93L(gt3): ld u0, -8(up)
94 nor r11, r8, r9
95 sld r8, u1, cnt
96 srd r9, u0, tnc
97 ld u1, -16(up)
98 nor r10, r12, r7
99 b L(L11)
100
C Feed-in for n mod 4 = 2.  With exactly 2 limbs the bdnz falls through and
C the two results are stored by the shared tail at cj2.
101 ALIGN(32)
102L(b10): sld r12, r10, cnt
103 addi rp, rp, 24
104 srd r7, r11, tnc
105 bdnz L(gt2)
106
107 sld r8, r11, cnt
108 nor r10, r12, r7
109 nor r8, r8, r8
110 b L(cj2)
111
C More than 2 limbs: prime the pipeline and enter the main loop at L10.
112L(gt2): ld u0, -24(up)
113 sld r8, r11, cnt
114 srd r9, u0, tnc
115 ld u1, -32(up)
116 nor r10, r12, r7
117 sld r12, u0, cnt
118 srd r7, u1, tnc
119 ld u0, -40(up)
120 nor r11, r8, r9
121 addi up, up, -16
122 b L(L10)
123
C Feed-in for n mod 4 = 0.  With exactly 4 limbs the bdz is taken and the
C four results are stored by the shared tail at cj4.
124 ALIGN(16)
125L(b00): ld u1, -16(up)
126 sld r12, r10, cnt
127 srd r7, u1, tnc
128 ld u0, -24(up)
129 sld r8, u1, cnt
130 srd r9, u0, tnc
131 ld u1, -32(up)
132 nor r10, r12, r7
133 sld r12, u0, cnt
134 srd r7, u1, tnc
135 addi rp, rp, 8
136 bdz L(cj4)
137
C More than 4 limbs: enter the main loop at L00.
138L(gt4): addi up, up, -32
139 ld u0, -8(up)
140 nor r11, r8, r9
141 b L(L00)
142
C Feed-in for n mod 4 = 1.  With exactly 1 limb the bdnz falls through and
C the single result limb is stored right here.
143 ALIGN(16)
144L(b01): bdnz L(gt1)
145 sld r8, r10, cnt
146 nor r8, r8, r8
147 std r8, -8(rp)
148 b L(ret)
149
C More than 1 limb: prime the pipeline.  The bdz below goes straight to the
C wind-down code when no loop pass remains; otherwise fall into the loop.
150L(gt1): ld u0, -16(up)
151 sld r8, r10, cnt
152 srd r9, u0, tnc
153 ld u1, -24(up)
154 sld r12, u0, cnt
155 srd r7, u1, tnc
156 ld u0, -32(up)
157 nor r11, r8, r9
158 sld r8, u1, cnt
159 srd r9, u0, tnc
160 ld u1, -40(up)
161 addi up, up, -40
162 nor r10, r12, r7
163 bdz L(end)
164
C Main loop: four limbs per pass.  Each group of five instructions shifts
C the two halves of one result limb, loads a source limb for later use,
C stores a previously finished limb and combines the next one with nor.
C L00, L11 and L10 are the entry points used by the feed-in code above.
C up and rp are stepped down by 32 bytes once per pass.
165 ALIGN(32)
166L(top): sld r12, u0, cnt
167 srd r7, u1, tnc
168 ld u0, -8(up)
169 std r11, -8(rp)
170 nor r11, r8, r9
171L(L00): sld r8, u1, cnt
172 srd r9, u0, tnc
173 ld u1, -16(up)
174 std r10, -16(rp)
175 nor r10, r12, r7
176L(L11): sld r12, u0, cnt
177 srd r7, u1, tnc
178 ld u0, -24(up)
179 std r11, -24(rp)
180 nor r11, r8, r9
181L(L10): sld r8, u1, cnt
182 srd r9, u0, tnc
183 ld u1, -32(up)
184 addi up, up, -32
185 std r10, -32(rp)
186 addi rp, rp, -32
187 nor r10, r12, r7
188 bdnz L(top)
189
C Wind-down: no more loads; finish and store the results still in flight.
C cj4, cj3 and cj2 are entry points for operands too short to reach the
C loop.  The last limb stored (r8) is the lowest limb of the result.
190 ALIGN(32)
191L(end): sld r12, u0, cnt
192 srd r7, u1, tnc
193 std r11, -8(rp)
194L(cj4): nor r11, r8, r9
195 sld r8, u1, cnt
196 std r10, -16(rp)
197 nor r8, r8, r8
198L(cj3): nor r10, r12, r7
199 std r11, -24(rp)
200L(cj2): std r10, -32(rp)
201 std r8, -40(rp)
202
C Restore the two saved registers and hand back retval.
203L(ret): ld r31, -8(r1)
204 ld r30, -16(r1)
C With HAVE_ABI_mode32 the value goes back in two registers: r3 gets
C retval >> 32 and r4 gets a copy of retval.  Otherwise r3 gets retval.
205ifdef(`HAVE_ABI_mode32',
206` srdi r3, retval, 32
207 mr r4, retval
208',` mr r3, retval')
209 blr
210EPILOGUE()
210EPILOGUE()