/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* HPMPC is free software; you can redistribute it and/or                                          *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* HPMPC is distributed in the hope that it will be useful,                                        *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with HPMPC; if not, write to the Free Software                                    *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

29#define STACKSIZE 11*16
30#define PROLOGUE \
31 sub sp, sp, #(11 * 16); \
32 stp d8, d9, [sp, #(0 * 16)]; \
33 stp d10, d11, [sp, #(1 * 16)]; \
34 stp d12, d13, [sp, #(2 * 16)]; \
35 stp d14, d15, [sp, #(3 * 16)]; \
36 stp x18, x19, [sp, #(4 * 16)]; \
37 stp x20, x21, [sp, #(5 * 16)]; \
38 stp x22, x23, [sp, #(6 * 16)]; \
39 stp x24, x25, [sp, #(7 * 16)]; \
40 stp x26, x27, [sp, #(8 * 16)]; \
41 stp x28, x29, [sp, #(9 * 16)]; \
42 str x30, [sp, #(10 * 16)];
43#define EPILOGUE \
44 ldp d8, d9, [sp, #(0 * 16)]; \
45 ldp d10, d11, [sp, #(1 * 16)]; \
46 ldp d12, d13, [sp, #(2 * 16)]; \
47 ldp d14, d15, [sp, #(3 * 16)]; \
48 ldp x18, x19, [sp, #(4 * 16)]; \
49 ldp x20, x21, [sp, #(5 * 16)]; \
50 ldp x22, x23, [sp, #(6 * 16)]; \
51 ldp x24, x25, [sp, #(7 * 16)]; \
52 ldp x26, x27, [sp, #(8 * 16)]; \
53 ldp x28, x29, [sp, #(9 * 16)]; \
54 ldr x30, [sp, #(10 * 16)]; \
55 add sp, sp, #(11 * 16);
56
57
58
59
60
61 .text
62
63
64
65
66
67// subroutine
68//
69// input arguments:
70// w8 <- k
71// x9 <- A
72// x10 <- sda
73// x11 <- B
74//
75// output arguments:
76
77#if MACRO_LEVEL>=2
78 .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
79#else
80 .align 4
81 .type inner_kernel_gemm_add_nt_8x8_lib4, %function
82inner_kernel_gemm_add_nt_8x8_lib4:
83#endif
84
85 // early return
86 cmp w8, #0
87 ble 2f // return
88
89 add x13, x9, x10
90 add x14, x11, x12
91
92 // prefetch
93 prfm PLDL1KEEP, [x11, #0]
94 prfm PLDL1KEEP, [x9, #0]
95 prfm PLDL1KEEP, [x13, #0]
96 prfm PLDL1KEEP, [x14, #0]
97
98 // preload
99 ld1 {v24.4s, v25.4s}, [x9], #32
100 ld1 {v28.4s, v29.4s}, [x11], #32
101 ld1 {v20.4s, v21.4s}, [x13], #32
102 ld1 {v16.4s, v17.4s}, [x14], #32
103
104 cmp w8, #4
105 ble 0f // consider clean up loop
106
107 // prefetch
108 prfm PLDL1KEEP, [x11, #32]
109 prfm PLDL1KEEP, [x9, #32]
110 prfm PLDL1KEEP, [x13, #32]
111 prfm PLDL1KEEP, [x14, #32]
112
113 // main loop
1141:
115
116 // unroll 0
117 ld1 {v26.4s}, [x9], #16
118 fmla v0.4s, v24.4s, v28.4s[0]
119 fmla v1.4s, v24.4s, v28.4s[1]
120 ld1 {v27.4s}, [x9], #16
121 fmla v2.4s, v24.4s, v28.4s[2]
122 fmla v3.4s, v24.4s, v28.4s[3]
123 ld1 {v30.4s}, [x11], #16
124 fmla v4.4s, v20.4s, v28.4s[0]
125 fmla v5.4s, v20.4s, v28.4s[1]
126 ld1 {v31.4s}, [x11], #16
127 fmla v6.4s, v20.4s, v28.4s[2]
128 fmla v7.4s, v20.4s, v28.4s[3]
129 ld1 {v22.4s}, [x13], #16
130 fmla v8.4s, v24.4s, v16.4s[0]
131 fmla v9.4s, v24.4s, v16.4s[1]
132 ld1 {v23.4s}, [x13], #16
133 fmla v10.4s, v24.4s, v16.4s[2]
134 fmla v11.4s, v24.4s, v16.4s[3]
135 ld1 {v18.4s}, [x14], #16
136 fmla v12.4s, v20.4s, v16.4s[0]
137 fmla v13.4s, v20.4s, v16.4s[1]
138 ld1 {v19.4s}, [x14], #16
139 fmla v14.4s, v20.4s, v16.4s[2]
140 fmla v15.4s, v20.4s, v16.4s[3]
141
142 // unroll 1
143 prfm PLDL1KEEP, [x11, #64]
144 fmla v0.4s, v25.4s, v29.4s[0]
145 fmla v1.4s, v25.4s, v29.4s[1]
146 prfm PLDL1KEEP, [x9, #64]
147 fmla v2.4s, v25.4s, v29.4s[2]
148 fmla v3.4s, v25.4s, v29.4s[3]
149 prfm PLDL1KEEP, [x13, #64]
150 fmla v4.4s, v21.4s, v29.4s[0]
151 fmla v5.4s, v21.4s, v29.4s[1]
152 prfm PLDL1KEEP, [x14, #64]
153 fmla v6.4s, v21.4s, v29.4s[2]
154 fmla v7.4s, v21.4s, v29.4s[3]
155 sub w8, w8, #4
156 fmla v8.4s, v25.4s, v17.4s[0]
157 fmla v9.4s, v25.4s, v17.4s[1]
158 fmla v10.4s, v25.4s, v17.4s[2]
159 fmla v11.4s, v25.4s, v17.4s[3]
160 fmla v12.4s, v21.4s, v17.4s[0]
161 fmla v13.4s, v21.4s, v17.4s[1]
162 cmp w8, #4
163 fmla v14.4s, v21.4s, v17.4s[2]
164 fmla v15.4s, v21.4s, v17.4s[3]
165
166 // unroll 2
167 ld1 {v24.4s}, [x9], #16
168 fmla v0.4s, v26.4s, v30.4s[0]
169 fmla v1.4s, v26.4s, v30.4s[1]
170 ld1 {v25.4s}, [x9], #16
171 fmla v2.4s, v26.4s, v30.4s[2]
172 fmla v3.4s, v26.4s, v30.4s[3]
173 ld1 {v28.4s}, [x11], #16
174 fmla v4.4s, v22.4s, v30.4s[0]
175 fmla v5.4s, v22.4s, v30.4s[1]
176 ld1 {v29.4s}, [x11], #16
177 fmla v6.4s, v22.4s, v30.4s[2]
178 fmla v7.4s, v22.4s, v30.4s[3]
179 ld1 {v20.4s}, [x13], #16
180 fmla v8.4s, v26.4s, v18.4s[0]
181 fmla v9.4s, v26.4s, v18.4s[1]
182 ld1 {v21.4s}, [x13], #16
183 fmla v10.4s, v26.4s, v18.4s[2]
184 fmla v11.4s, v26.4s, v18.4s[3]
185 ld1 {v16.4s}, [x14], #16
186 fmla v12.4s, v22.4s, v18.4s[0]
187 fmla v13.4s, v22.4s, v18.4s[1]
188 ld1 {v17.4s}, [x14], #16
189 fmla v14.4s, v22.4s, v18.4s[2]
190 fmla v15.4s, v22.4s, v18.4s[3]
191
192 // unroll 3
193 fmla v0.4s, v27.4s, v31.4s[0]
194 fmla v1.4s, v27.4s, v31.4s[1]
195 fmla v2.4s, v27.4s, v31.4s[2]
196 fmla v3.4s, v27.4s, v31.4s[3]
197 fmla v4.4s, v23.4s, v31.4s[0]
198 fmla v5.4s, v23.4s, v31.4s[1]
199 fmla v6.4s, v23.4s, v31.4s[2]
200 fmla v7.4s, v23.4s, v31.4s[3]
201 fmla v8.4s, v27.4s, v19.4s[0]
202 fmla v9.4s, v27.4s, v19.4s[1]
203 fmla v10.4s, v27.4s, v19.4s[2]
204 fmla v11.4s, v27.4s, v19.4s[3]
205 fmla v12.4s, v23.4s, v19.4s[0]
206 fmla v13.4s, v23.4s, v19.4s[1]
207 fmla v14.4s, v23.4s, v19.4s[2]
208 fmla v15.4s, v23.4s, v19.4s[3]
209
210 bgt 1b
211
2120:
213
214 cmp w8, #3
215 ble 4f
216
217 // unroll 0
218 ld1 {v26.4s}, [x9], #16
219 fmla v0.4s, v24.4s, v28.4s[0]
220 fmla v1.4s, v24.4s, v28.4s[1]
221 ld1 {v27.4s}, [x9], #16
222 fmla v2.4s, v24.4s, v28.4s[2]
223 fmla v3.4s, v24.4s, v28.4s[3]
224 ld1 {v30.4s}, [x11], #16
225 fmla v4.4s, v20.4s, v28.4s[0]
226 fmla v5.4s, v20.4s, v28.4s[1]
227 ld1 {v31.4s}, [x11], #16
228 fmla v6.4s, v20.4s, v28.4s[2]
229 fmla v7.4s, v20.4s, v28.4s[3]
230 ld1 {v22.4s}, [x13], #16
231 fmla v8.4s, v24.4s, v16.4s[0]
232 fmla v9.4s, v24.4s, v16.4s[1]
233 ld1 {v23.4s}, [x13], #16
234 fmla v10.4s, v24.4s, v16.4s[2]
235 fmla v11.4s, v24.4s, v16.4s[3]
236 ld1 {v18.4s}, [x14], #16
237 fmla v12.4s, v20.4s, v16.4s[0]
238 fmla v13.4s, v20.4s, v16.4s[1]
239 ld1 {v19.4s}, [x14], #16
240 fmla v14.4s, v20.4s, v16.4s[2]
241 fmla v15.4s, v20.4s, v16.4s[3]
242
243 // unroll 1
244// prfm PLDL1KEEP, [x11, #64]
245 fmla v0.4s, v25.4s, v29.4s[0]
246 fmla v1.4s, v25.4s, v29.4s[1]
247// prfm PLDL1KEEP, [x9, #64]
248 fmla v2.4s, v25.4s, v29.4s[2]
249 fmla v3.4s, v25.4s, v29.4s[3]
250// prfm PLDL1KEEP, [x13, #64]
251 fmla v4.4s, v21.4s, v29.4s[0]
252 fmla v5.4s, v21.4s, v29.4s[1]
253// prfm PLDL1KEEP, [x14, #64]
254 fmla v6.4s, v21.4s, v29.4s[2]
255 fmla v7.4s, v21.4s, v29.4s[3]
256 sub w8, w8, #4
257 fmla v8.4s, v25.4s, v17.4s[0]
258 fmla v9.4s, v25.4s, v17.4s[1]
259 fmla v10.4s, v25.4s, v17.4s[2]
260 fmla v11.4s, v25.4s, v17.4s[3]
261 fmla v12.4s, v21.4s, v17.4s[0]
262 fmla v13.4s, v21.4s, v17.4s[1]
263 cmp w8, #4
264 fmla v14.4s, v21.4s, v17.4s[2]
265 fmla v15.4s, v21.4s, v17.4s[3]
266
267 // unroll 2
268// ld1 {v24.4s}, [x9], #16
269 fmla v0.4s, v26.4s, v30.4s[0]
270 fmla v1.4s, v26.4s, v30.4s[1]
271// ld1 {v25.4s}, [x9], #16
272 fmla v2.4s, v26.4s, v30.4s[2]
273 fmla v3.4s, v26.4s, v30.4s[3]
274// ld1 {v28.4s}, [x11], #16
275 fmla v4.4s, v22.4s, v30.4s[0]
276 fmla v5.4s, v22.4s, v30.4s[1]
277// ld1 {v29.4s}, [x11], #16
278 fmla v6.4s, v22.4s, v30.4s[2]
279 fmla v7.4s, v22.4s, v30.4s[3]
280// ld1 {v20.4s}, [x13], #16
281 fmla v8.4s, v26.4s, v18.4s[0]
282 fmla v9.4s, v26.4s, v18.4s[1]
283// ld1 {v21.4s}, [x13], #16
284 fmla v10.4s, v26.4s, v18.4s[2]
285 fmla v11.4s, v26.4s, v18.4s[3]
286// ld1 {v16.4s}, [x14], #16
287 fmla v12.4s, v22.4s, v18.4s[0]
288 fmla v13.4s, v22.4s, v18.4s[1]
289// ld1 {v17.4s}, [x14], #16
290 fmla v14.4s, v22.4s, v18.4s[2]
291 fmla v15.4s, v22.4s, v18.4s[3]
292
293 // unroll 3
294 fmla v0.4s, v27.4s, v31.4s[0]
295 fmla v1.4s, v27.4s, v31.4s[1]
296 fmla v2.4s, v27.4s, v31.4s[2]
297 fmla v3.4s, v27.4s, v31.4s[3]
298 fmla v4.4s, v23.4s, v31.4s[0]
299 fmla v5.4s, v23.4s, v31.4s[1]
300 fmla v6.4s, v23.4s, v31.4s[2]
301 fmla v7.4s, v23.4s, v31.4s[3]
302 fmla v8.4s, v27.4s, v19.4s[0]
303 fmla v9.4s, v27.4s, v19.4s[1]
304 fmla v10.4s, v27.4s, v19.4s[2]
305 fmla v11.4s, v27.4s, v19.4s[3]
306 fmla v12.4s, v23.4s, v19.4s[0]
307 fmla v13.4s, v23.4s, v19.4s[1]
308 fmla v14.4s, v23.4s, v19.4s[2]
309 fmla v15.4s, v23.4s, v19.4s[3]
310
311 b 2f // return
312
3134: // consider clean1-up loop
314
315 cmp w8, #0
316 ble 2f // return
317
318 sub x9, x9, #32
319 sub x13, x13, #32
320 sub x11, x11, #32
321 sub x14, x14, #32
322
3233: // clean1-up loop
324
325 // unroll 0
326
327 ld1 {v28.4s}, [x11], #16
328 ld1 {v24.4s}, [x9], #16
329 fmla v0.4s, v24.4s, v28.4s[0]
330 fmla v1.4s, v24.4s, v28.4s[1]
331 fmla v2.4s, v24.4s, v28.4s[2]
332 fmla v3.4s, v24.4s, v28.4s[3]
333 ld1 {v20.4s}, [x13], #16
334 fmla v4.4s, v20.4s, v28.4s[0]
335 fmla v5.4s, v20.4s, v28.4s[1]
336 fmla v6.4s, v20.4s, v28.4s[2]
337 fmla v7.4s, v20.4s, v28.4s[3]
338 ld1 {v16.4s}, [x14], #16
339 fmla v8.4s, v24.4s, v16.4s[0]
340 fmla v9.4s, v24.4s, v16.4s[1]
341 fmla v10.4s, v24.4s, v16.4s[2]
342 fmla v11.4s, v24.4s, v16.4s[3]
343 fmla v12.4s, v20.4s, v16.4s[0]
344 fmla v13.4s, v20.4s, v16.4s[1]
345 fmla v14.4s, v20.4s, v16.4s[2]
346 fmla v15.4s, v20.4s, v16.4s[3]
347
348 sub w8, w8, #1
349 cmp w8, #0
350 bgt 3b
351
3522: // return
353
354#if MACRO_LEVEL>=2
355 .endm
356#else
357 ret
358
359 .size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
360#endif
361
362
363
364
365
366// subroutine
367//
368// input arguments:
369// x8 <- alpha
370// x9 <- beta
371// x10 <- C
372// x11 <- sdc
373//
374// output arguments:
375
376#if MACRO_LEVEL>=2
377 .macro INNER_SCALE_AB_8X8_LIB4
378#else
379 .align 4
380 .type inner_scale_ab_8x8_lib4, %function
381inner_scale_ab_8x8_lib4:
382#endif
383
384 ld1 {v28.4s}, [x8]
385
386 fmul v0.4s, v0.4s, v28.4s[0]
387 fmul v1.4s, v1.4s, v28.4s[0]
388 fmul v2.4s, v2.4s, v28.4s[0]
389 fmul v3.4s, v3.4s, v28.4s[0]
390 fmul v4.4s, v4.4s, v28.4s[0]
391 fmul v5.4s, v5.4s, v28.4s[0]
392 fmul v6.4s, v6.4s, v28.4s[0]
393 fmul v7.4s, v7.4s, v28.4s[0]
394 fmul v8.4s, v8.4s, v28.4s[0]
395 fmul v9.4s, v9.4s, v28.4s[0]
396 fmul v10.4s, v10.4s, v28.4s[0]
397 fmul v11.4s, v11.4s, v28.4s[0]
398 fmul v12.4s, v12.4s, v28.4s[0]
399 fmul v13.4s, v13.4s, v28.4s[0]
400 fmul v14.4s, v14.4s, v28.4s[0]
401 fmul v15.4s, v15.4s, v28.4s[0]
402
403 ld1 {v28.4s}, [x9]
404
405 add x12, x10, x11
406
407 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
408 fmla v0.4s, v24.4s, v28.4s[0]
409 fmla v1.4s, v25.4s, v28.4s[0]
410 fmla v2.4s, v26.4s, v28.4s[0]
411 fmla v3.4s, v27.4s, v28.4s[0]
412
413 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
414 fmla v4.4s, v24.4s, v28.4s[0]
415 fmla v5.4s, v25.4s, v28.4s[0]
416 fmla v6.4s, v26.4s, v28.4s[0]
417 fmla v7.4s, v27.4s, v28.4s[0]
418
419 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
420 fmla v8.4s, v24.4s, v28.4s[0]
421 fmla v9.4s, v25.4s, v28.4s[0]
422 fmla v10.4s, v26.4s, v28.4s[0]
423 fmla v11.4s, v27.4s, v28.4s[0]
424
425 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
426 fmla v12.4s, v24.4s, v28.4s[0]
427 fmla v13.4s, v25.4s, v28.4s[0]
428 fmla v14.4s, v26.4s, v28.4s[0]
429 fmla v15.4s, v27.4s, v28.4s[0]
430
431#if MACRO_LEVEL>=2
432 .endm
433#else
434 ret
435
436 .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
437#endif
438
439
440
441
442
443// subroutine
444//
445// input arguments:
446// x8 <- D
447// x9 <- sdd
448//
449// output arguments:
450
451#if MACRO_LEVEL>=2
452 .macro INNER_STORE_8X8_LIB4
453#else
454 .align 4
455 .type inner_store_8x8_lib4, %function
456inner_store_8x8_lib4:
457#endif
458
459 add x10, x8, x9
460
461 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
462 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
463 st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
464 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
465
466#if MACRO_LEVEL>=2
467 .endm
468#else
469 ret
470
471 .size inner_store_8x8_lib4, .-inner_store_8x8_lib4
472#endif
473
474
475
476
477
478// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
479// void kernel_sgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd)
480
481 .align 4
482 .global kernel_sgemm_nt_8x8_lib4
483 .type kernel_sgemm_nt_8x8_lib4, %function
484kernel_sgemm_nt_8x8_lib4:
485
486
487
488 PROLOGUE
489
490
491
492 // TODO zero the entire 128-bit register ???
493 fmov d0, xzr
494 fmov d1, d0
495 fmov d2, d0
496 fmov d3, d0
497 fmov d4, d0
498 fmov d5, d0
499 fmov d6, d0
500 fmov d7, d0
501 fmov d8, d0
502 fmov d9, d0
503 fmov d10, d0
504 fmov d11, d0
505 fmov d12, d0
506 fmov d13, d0
507 fmov d14, d0
508 fmov d15, d0
509
510
511
512 // call inner kernel gemm nt
513 mov w8, w0 // kmax
514 mov x9, x2 // A
515 mov w10, w3 // sda
516 lsl w10, w10, #4 // 16*sda
517 mov x11, x4 // B
518 mov w12, w5 // sdb
519 lsl w12, w12, #4 // 16*sdb
520
521#if MACRO_LEVEL>=2
522 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
523#else
524 bl inner_kernel_gemm_add_nt_8x8_lib4
525#endif
526
527
528
529 // call inner blend for generic alpha and beta
530 mov x8, x1 // alpha
531 mov x9, x6 // beta
532 mov x10, x7 // C
533 ldr w11, [sp, #(STACKSIZE + 0)] // D
534 lsl w11, w11, #4 // 16*sdc
535
536#if MACRO_LEVEL>=1
537 INNER_SCALE_AB_8X8_LIB4
538#else
539 bl inner_scale_ab_8x8_lib4
540#endif
541
542
543
544 // store n
545 ldr x8, [sp, #(STACKSIZE + 8)] // D
546 ldr w9, [sp, #(STACKSIZE + 16)] // sdd
547 lsl w9, w9, #4 // 16*sdd
548
549#if MACRO_LEVEL>=1
550 INNER_STORE_8X8_LIB4
551#else
552 bl inner_store_8x8_lib4
553#endif
554
555
556
557 EPILOGUE
558
559 mov x0, #0
560
561 ret
562
563
564
565