blob: e8cec872956658df2f4ef67465ce8e9810744490 [file] [log] [blame]
Austin Schuhdace2a62020-08-18 10:56:48 -07001dnl IA-64 mpn_lshiftc.
2
3dnl Contributed to the GNU project by Torbjorn Granlund.
4
5dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc.
6
7dnl This file is part of the GNU MP Library.
8dnl
9dnl The GNU MP Library is free software; you can redistribute it and/or modify
10dnl it under the terms of either:
11dnl
12dnl * the GNU Lesser General Public License as published by the Free
13dnl Software Foundation; either version 3 of the License, or (at your
14dnl option) any later version.
15dnl
16dnl or
17dnl
18dnl * the GNU General Public License as published by the Free Software
19dnl Foundation; either version 2 of the License, or (at your option) any
20dnl later version.
21dnl
22dnl or both in parallel, as here.
23dnl
24dnl The GNU MP Library is distributed in the hope that it will be useful, but
25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27dnl for more details.
28dnl
29dnl You should have received copies of the GNU General Public License and the
30dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31dnl see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C cycles/limb
36C Itanium: ?
37C Itanium 2: 1.25
38
39C This code is scheduled deeply since the plain shift instructions shr and shl
40C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
41C these instructions causes a 10 cycle replay trap on Itanium.
42
43C The ld8 scheduling should probably be decreased to make the function smaller.
44C Good lfetch will make sure we never stall anyway.
45
46C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
47C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
48C in the prologue.
49
50
51C INPUT PARAMETERS
C The four arguments arrive in r32..r35. In the code below rp is the base
C of every st8, up is the base of every ld8, n is reduced to a loop count
C and cnt is the shift count given to FSH.
52define(`rp', `r32')
53define(`up', `r33')
54define(`n', `r34')
55define(`cnt',`r35')
56
C tnc holds 64-cnt, the complementary shift count given to BSH.
57define(`tnc',`r9')
58
C FSH and BSH are the two shift instructions that build each result limb.
C UPD is the post-increment applied to up and rp on every ld8 and st8, so
C both pointers move towards lower addresses, one 8-byte limb at a time.
C POFF is the initial offset of the lfetch address from up, and PUPD is the
C post-increment applied by each lfetch.
C func is the symbol name handed to EPILOGUE.
59define(`FSH',`shl')
60define(`BSH',`shr.u')
61define(`UPD',`-8')
62define(`POFF',`-512')
63define(`PUPD',`-32')
64define(`func',`mpn_lshiftc')
65
66ASM_START()
C mpn_lshiftc -- shift the n-limb operand at up left by cnt bits and store
C the bitwise complement of each result limb at rp.
C
C Each result limb is formed as (hi FSH cnt) or (lo BSH tnc), where hi and
C lo are adjacent source limbs and tnc = 64-cnt, and is then complemented
C with sub r31 = -1, x, since -1 - x equals the bitwise complement of x.
C The lowest result limb has no lower neighbour and is just the complement
C of the lowest source limb shifted by cnt.
C
C r8 is set once on every path to the first loaded limb shifted right by
C tnc, and is never read here; presumably it is the function result, the
C bits shifted out at the top -- TODO confirm against the calling convention.
C
C NOTE(review): the code appears to assume n >= 1 and 1 <= cnt <= 63; no
C check for other values is visible here -- confirm against callers.
67PROLOGUE(mpn_lshiftc)
68 .prologue
69 .save ar.lc, r2
70 .body
C With the 32-bit ABI the pointer and integer arguments are first widened
C to 64 bits.
71ifdef(`HAVE_ABI_32',
72` addp4 rp = 0, rp C M I
73 addp4 up = 0, up C M I
74 sxt4 n = n C M I
75 nop.m 0
76 nop.m 0
77 zxt4 cnt = cnt C I
78 ;;
79')
80
C Setup. r14 = n mod 4 selects the feed-in path; r15 = n-1; tnc = 64-cnt.
C The loop count register gets (n-1)/4, and up and rp are advanced by
C 8*(n-1) bytes so that they address the last limb of each operand. Limbs
C are then processed at descending addresses, since UPD is negative.
C The incoming value of ar.lc is kept in r2 and restored before returning.
C r10 receives the first limb loaded, and r11 = up+POFF is the address used
C by the lfetch in the main loop.
81 {.mmi; nop 0 C M I
82 and r14 = 3, n C M I
83 mov.i r2 = ar.lc C I0
84}{.mmi; add r15 = -1, n C M I
85 sub tnc = 64, cnt C M I
86 nop 0
87 ;;
88}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
89 cmp.eq p7, p0 = 2, r14 C M I
90 shr.u n = r15, 2 C I0
91}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
92 shladd up = r15, 3, up C M I
93 shladd rp = r15, 3, rp C M I
94 ;;
95}{.mmi; add r11 = POFF, up C M I
96 ld8 r10 = [up], UPD C M01
97 mov.i ar.lc = n C I0
98}{.bbb;
99 (p6) br.dptk .Lb01
100 (p7) br.dptk .Lb10
101 (p8) br.dptk .Lb11
102 ;; }
103
C Feed-in for n mod 4 = 0; reached by falling through when none of p6, p7
C and p8 is set. Each br.cloop below is taken while ar.lc is non-zero, so
C the straight-line code after it handles the case with no further groups
C of four limbs and then joins the wind-down chain.
104.Lb00:
105 ld8 r19 = [up], UPD
106 ;;
107 ld8 r16 = [up], UPD
108 ;;
109 ld8 r17 = [up], UPD
110 BSH r8 = r10, tnc
111 br.cloop.dptk L(gt4)
112 ;;
113 FSH r24 = r10, cnt
114 BSH r25 = r19, tnc
115 ;;
116 FSH r26 = r19, cnt
117 BSH r27 = r16, tnc
118 ;;
119 FSH r20 = r16, cnt
120 BSH r21 = r17, tnc
121 ;;
122 or r14 = r25, r24
123 FSH r22 = r17, cnt
124 ;;
125 or r15 = r27, r26
126 sub r31 = -1, r14
127 br .Lr4
128
C n mod 4 = 0 and more than 4 limbs.
129L(gt4):
130 {.mmi; nop 0
131 nop 0
132 FSH r24 = r10, cnt
133}{.mmi; ld8 r18 = [up], UPD
134 nop 0
135 BSH r25 = r19, tnc
136 ;; }
137 {.mmi; nop 0
138 nop 0
139 FSH r26 = r19, cnt
140}{.mmi; ld8 r19 = [up], UPD
141 nop 0
142 BSH r27 = r16, tnc
143 ;; }
144 {.mmi; nop 0
145 nop 0
146 FSH r20 = r16, cnt
147}{.mmi; ld8 r16 = [up], UPD
148 nop 0
149 BSH r21 = r17, tnc
150 ;; }
151 {.mmi; nop 0
152 or r14 = r25, r24
153 FSH r22 = r17, cnt
154}{.mib; ld8 r17 = [up], UPD
155 BSH r23 = r18, tnc
156 br.cloop.dptk L(gt8)
157 ;; }
158 {.mmi; nop 0
159 or r15 = r27, r26
160 FSH r24 = r18, cnt
161}{.mib; sub r31 = -1, r14
162 BSH r25 = r19, tnc
163 br .Lr8 }
164
C n mod 4 = 0 and more than 8 limbs; enters the main loop at .LL00.
165L(gt8):
166 or r15 = r27, r26
167 FSH r24 = r18, cnt
168 ld8 r18 = [up], UPD
169 sub r31 = -1, r14
170 BSH r25 = r19, tnc
171 br .LL00
172
C Feed-in for n mod 4 = 1. When the br.cloop is not taken there is a single
C limb, and the result is the complement of that limb shifted by cnt.
173.Lb01:
174 br.cloop.dptk L(gt1)
175 ;;
176 BSH r8 = r10, tnc
177 FSH r22 = r10, cnt
178 ;;
179 sub r31 = -1, r22
180 br .Lr1
181 ;;
C n mod 4 = 1 and more than 1 limb.
182L(gt1):
183 ld8 r18 = [up], UPD
184 BSH r8 = r10, tnc
185 FSH r22 = r10, cnt
186 ;;
187 ld8 r19 = [up], UPD
188 ;;
189 ld8 r16 = [up], UPD
190 ;;
191 ld8 r17 = [up], UPD
192 BSH r23 = r18, tnc
193 br.cloop.dptk L(gt5)
194 ;;
195 nop 0
196 FSH r24 = r18, cnt
197 BSH r25 = r19, tnc
198 ;;
199 nop 0
200 FSH r26 = r19, cnt
201 BSH r27 = r16, tnc
202 ;;
203 or r15 = r23, r22
204 FSH r20 = r16, cnt
205 BSH r21 = r17, tnc
206 ;;
207 or r14 = r25, r24
208 FSH r22 = r17, cnt
209 sub r31 = -1, r15
210 br .Lr5
211
C n mod 4 = 1 and more than 5 limbs; joins the main loop at its bottom,
C at L(end).
212L(gt5):
213 {.mmi; nop 0
214 nop 0
215 FSH r24 = r18, cnt
216}{.mmi; ld8 r18 = [up], UPD
217 nop 0
218 BSH r25 = r19, tnc
219 ;; }
220 {.mmi; nop 0
221 nop 0
222 FSH r26 = r19, cnt
223}{.mmi; ld8 r19 = [up], UPD
224 nop 0
225 BSH r27 = r16, tnc
226 ;; }
227 {.mmi; nop 0
228 or r15 = r23, r22
229 FSH r20 = r16, cnt
230}{.mmi; ld8 r16 = [up], UPD
231 nop 0
232 BSH r21 = r17, tnc
233 ;; }
234 {.mmi; or r14 = r25, r24
235 sub r31 = -1, r15
236 FSH r22 = r17, cnt
237}{.mib; ld8 r17 = [up], UPD
238 BSH r23 = r18, tnc
239 br L(end)
240 ;; }
241
C Feed-in for n mod 4 = 2. When the br.cloop is not taken there are exactly
C two limbs.
242.Lb10:
243 ld8 r17 = [up], UPD
244 br.cloop.dptk L(gt2)
245 ;;
246 BSH r8 = r10, tnc
247 FSH r20 = r10, cnt
248 ;;
249 BSH r21 = r17, tnc
250 FSH r22 = r17, cnt
251 ;;
252 or r14 = r21, r20
253 ;;
254 sub r31 = -1, r14
255 br .Lr2
256 ;;
C n mod 4 = 2 and more than 2 limbs.
257L(gt2):
258 ld8 r18 = [up], UPD
259 BSH r8 = r10, tnc
260 FSH r20 = r10, cnt
261 ;;
262 ld8 r19 = [up], UPD
263 ;;
264 ld8 r16 = [up], UPD
265 BSH r21 = r17, tnc
266 FSH r22 = r17, cnt
267 ;;
268 ld8 r17 = [up], UPD
269 BSH r23 = r18, tnc
270 br.cloop.dptk L(gt6)
271 ;;
272 nop 0
273 FSH r24 = r18, cnt
274 BSH r25 = r19, tnc
275 ;;
276 or r14 = r21, r20
277 FSH r26 = r19, cnt
278 BSH r27 = r16, tnc
279 ;;
280 {.mmi; nop 0
281 or r15 = r23, r22
282 FSH r20 = r16, cnt
283}{.mib; sub r31 = -1, r14
284 BSH r21 = r17, tnc
285 br .Lr6
286 ;; }
C n mod 4 = 2 and more than 6 limbs; enters the main loop at .LL10.
287L(gt6):
288 {.mmi; nop 0
289 nop 0
290 FSH r24 = r18, cnt
291}{.mmi; ld8 r18 = [up], UPD
292 nop 0
293 BSH r25 = r19, tnc
294 ;; }
295 {.mmi; nop 0
296 or r14 = r21, r20
297 FSH r26 = r19, cnt
298}{.mmi; ld8 r19 = [up], UPD
299 nop 0
300 BSH r27 = r16, tnc
301 ;; }
302 {.mmi; or r15 = r23, r22
303 sub r31 = -1, r14
304 FSH r20 = r16, cnt
305}{.mib; ld8 r16 = [up], UPD
306 BSH r21 = r17, tnc
307 br .LL10
308}
309
C Feed-in for n mod 4 = 3. When the br.cloop is not taken there are exactly
C three limbs.
310.Lb11:
311 ld8 r16 = [up], UPD
312 ;;
313 ld8 r17 = [up], UPD
314 BSH r8 = r10, tnc
315 FSH r26 = r10, cnt
316 br.cloop.dptk L(gt3)
317 ;;
318 BSH r27 = r16, tnc
319 ;;
320 FSH r20 = r16, cnt
321 BSH r21 = r17, tnc
322 ;;
323 FSH r22 = r17, cnt
324 ;;
325 or r15 = r27, r26
326 ;;
327 or r14 = r21, r20
328 sub r31 = -1, r15
329 br .Lr3
330 ;;
C n mod 4 = 3 and more than 3 limbs.
331L(gt3):
332 ld8 r18 = [up], UPD
333 ;;
334 ld8 r19 = [up], UPD
335 BSH r27 = r16, tnc
336 ;;
337 {.mmi; nop 0
338 nop 0
339 FSH r20 = r16, cnt
340}{.mmi; ld8 r16 = [up], UPD
341 nop 0
342 BSH r21 = r17, tnc
343 ;;
344}{.mmi; nop 0
345 nop 0
346 FSH r22 = r17, cnt
347}{.mib; ld8 r17 = [up], UPD
348 BSH r23 = r18, tnc
349 br.cloop.dptk L(gt7)
350 ;; }
351 or r15 = r27, r26
352 FSH r24 = r18, cnt
353 BSH r25 = r19, tnc
354 ;;
355 {.mmi; nop 0
356 or r14 = r21, r20
357 FSH r26 = r19, cnt
358}{.mib; sub r31 = -1, r15
359 BSH r27 = r16, tnc
360 br .Lr7
361}
C n mod 4 = 3 and more than 7 limbs; enters the main loop at .LL11.
362L(gt7):
363 {.mmi; nop 0
364 or r15 = r27, r26
365 FSH r24 = r18, cnt
366}{.mmi; ld8 r18 = [up], UPD
367 nop 0
368 BSH r25 = r19, tnc
369 ;; }
370 {.mmi; or r14 = r21, r20
371 sub r31 = -1, r15
372 FSH r26 = r19, cnt
373}{.mib; ld8 r19 = [up], UPD
374 BSH r27 = r16, tnc
375 br .LL11
376}
377
C The loop is unrolled four ways. Each of the four stages stores one
C finished limb from r31, combines one shifted pair with or, complements
C the previously combined value into r31, loads one new source limb, and
C issues one FSH and one BSH for later stages. Source limbs rotate through
C r16..r19, shift results through r20..r27, and combined values alternate
C between r14 and r15. The feed-in code above jumps to .LL00, .LL11, .LL10
C or L(end) with the pipeline registers already primed; no branch in this
C function targets .LL01 directly.
378C *** MAIN LOOP START ***
379 ALIGN(32)
380L(top):
381.LL01:
382 {.mmi; st8 [rp] = r31, UPD C M2
383 or r15 = r27, r26 C M3
384 FSH r24 = r18, cnt C I0
385}{.mmi; ld8 r18 = [up], UPD C M0
386 sub r31 = -1, r14 C M1
387 BSH r25 = r19, tnc C I1
388 ;; }
389.LL00:
390 {.mmi; st8 [rp] = r31, UPD
391 or r14 = r21, r20
392 FSH r26 = r19, cnt
393}{.mmi; ld8 r19 = [up], UPD
394 sub r31 = -1, r15
395 BSH r27 = r16, tnc
396 ;; }
397.LL11:
398 {.mmi; st8 [rp] = r31, UPD
399 or r15 = r23, r22
400 FSH r20 = r16, cnt
401}{.mmi; ld8 r16 = [up], UPD
402 sub r31 = -1, r14
403 BSH r21 = r17, tnc
404 ;; }
405.LL10:
406 {.mmi; st8 [rp] = r31, UPD
407 or r14 = r25, r24
408 FSH r22 = r17, cnt
409}{.mmi; ld8 r17 = [up], UPD
410 sub r31 = -1, r15
411 BSH r23 = r18, tnc
412 ;; }
C Prefetch through r11, which started at up+POFF and moves by PUPD on each
C iteration, then loop while ar.lc is non-zero.
413L(end): lfetch [r11], PUPD
414 br.cloop.dptk L(top)
415C *** MAIN LOOP END ***
416
C Wind-down. No further loads are issued; the limbs already in registers
C are shifted, combined, complemented and stored. Entering at .LrK stores
C the last K result limbs, so the feed-in paths for short operands jump
C directly to the matching label.
417 {.mmi; st8 [rp] = r31, UPD
418 or r15 = r27, r26
419 FSH r24 = r18, cnt
420}{.mib; sub r31 = -1, r14
421 BSH r25 = r19, tnc
422 nop 0
423 ;; }
424.Lr8:
425 {.mmi; st8 [rp] = r31, UPD
426 or r14 = r21, r20
427 FSH r26 = r19, cnt
428}{.mib; sub r31 = -1, r15
429 BSH r27 = r16, tnc
430 nop 0
431 ;; }
432.Lr7:
433 {.mmi; st8 [rp] = r31, UPD
434 or r15 = r23, r22
435 FSH r20 = r16, cnt
436}{.mib; sub r31 = -1, r14
437 BSH r21 = r17, tnc
438 nop 0
439 ;; }
440.Lr6: st8 [rp] = r31, UPD
441 or r14 = r25, r24
442 FSH r22 = r17, cnt
443 sub r31 = -1, r15
444 ;;
445.Lr5: st8 [rp] = r31, UPD
446 or r15 = r27, r26
447 sub r31 = -1, r14
448 ;;
449.Lr4: st8 [rp] = r31, UPD
450 or r14 = r21, r20
451 sub r31 = -1, r15
452 ;;
453.Lr3: st8 [rp] = r31, UPD
454 sub r31 = -1, r14
455 ;;
C The lowest result limb has no lower neighbour, so it is just the
C complement of r22, the lowest source limb shifted by cnt.
456.Lr2: st8 [rp] = r31, UPD
457 sub r31 = -1, r22
458 ;;
C Store the final limb, restore the ar.lc value saved in r2, and return.
459.Lr1: st8 [rp] = r31, UPD C M23
460 mov ar.lc = r2 C I0
461 br.ret.sptk.many b0 C B
462EPILOGUE(func)
463ASM_END()