blob: 016af72e725bdc3db18202bed92c014d04414f3d [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#define STACKSIZE 11*16
30#define PROLOGUE \
31 sub sp, sp, #(11 * 16); \
32 stp d8, d9, [sp, #(0 * 16)]; \
33 stp d10, d11, [sp, #(1 * 16)]; \
34 stp d12, d13, [sp, #(2 * 16)]; \
35 stp d14, d15, [sp, #(3 * 16)]; \
36 stp x18, x19, [sp, #(4 * 16)]; \
37 stp x20, x21, [sp, #(5 * 16)]; \
38 stp x22, x23, [sp, #(6 * 16)]; \
39 stp x24, x25, [sp, #(7 * 16)]; \
40 stp x26, x27, [sp, #(8 * 16)]; \
41 stp x28, x29, [sp, #(9 * 16)]; \
42 str x30, [sp, #(10 * 16)];
43#define EPILOGUE \
44 ldp d8, d9, [sp, #(0 * 16)]; \
45 ldp d10, d11, [sp, #(1 * 16)]; \
46 ldp d12, d13, [sp, #(2 * 16)]; \
47 ldp d14, d15, [sp, #(3 * 16)]; \
48 ldp x18, x19, [sp, #(4 * 16)]; \
49 ldp x20, x21, [sp, #(5 * 16)]; \
50 ldp x22, x23, [sp, #(6 * 16)]; \
51 ldp x24, x25, [sp, #(7 * 16)]; \
52 ldp x26, x27, [sp, #(8 * 16)]; \
53 ldp x28, x29, [sp, #(9 * 16)]; \
54 ldr x30, [sp, #(10 * 16)]; \
55 add sp, sp, #(11 * 16);
56
57
58
59
60
61 .text
62
63
64
65
66
67// subroutine
68//
69// input arguments:
70// w8 <- k
71// x9 <- A
72// x10 <- sda
73// x11 <- B
74//
75// output arguments:
76
77#if MACRO_LEVEL>=2
78 .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
79#else
80 .align 4
81 .type inner_kernel_gemm_add_nt_8x4_lib4, %function
82inner_kernel_gemm_add_nt_8x4_lib4:
83#endif
84
85 // early return
86 cmp w8, #0
87 ble 2f // return
88
89 add x12, x9, x10
90
91 // prefetch
92 prfm PLDL1KEEP, [x11, #0]
93 prfm PLDL1KEEP, [x9, #0]
94 prfm PLDL1KEEP, [x12, #0]
95
96 // preload
97 ld1 {v24.4s, v25.4s}, [x9], #32
98 ld1 {v28.4s, v29.4s}, [x11], #32
99 ld1 {v20.4s, v21.4s}, [x12], #32
100
101 cmp w8, #4
102 ble 0f // consider clean up loop
103
104 // prefetch
105 prfm PLDL1KEEP, [x11, #32]
106 prfm PLDL1KEEP, [x9, #32]
107 prfm PLDL1KEEP, [x12, #32]
108
109 // main loop
1101:
111
112 // unroll 0
113 fmla v0.4s, v24.4s, v28.4s[0]
114 ld1 {v26.4s, v27.4s}, [x9], #32
115 fmla v1.4s, v24.4s, v28.4s[1]
116 ld1 {v30.4s, v31.4s}, [x11], #32
117 fmla v2.4s, v24.4s, v28.4s[2]
118 ld1 {v22.4s, v23.4s}, [x12], #32
119 fmla v3.4s, v24.4s, v28.4s[3]
120 prfm PLDL1KEEP, [x11, #64]
121 fmla v4.4s, v20.4s, v28.4s[0]
122 prfm PLDL1KEEP, [x9, #64]
123 fmla v5.4s, v20.4s, v28.4s[1]
124 prfm PLDL1KEEP, [x12, #64]
125 fmla v6.4s, v20.4s, v28.4s[2]
126 fmla v7.4s, v20.4s, v28.4s[3]
127 sub w8, w8, #4
128
129 // unroll 1
130 fmla v0.4s, v25.4s, v29.4s[0]
131 fmla v1.4s, v25.4s, v29.4s[1]
132 fmla v2.4s, v25.4s, v29.4s[2]
133 fmla v3.4s, v25.4s, v29.4s[3]
134 fmla v4.4s, v21.4s, v29.4s[0]
135 fmla v5.4s, v21.4s, v29.4s[1]
136 fmla v6.4s, v21.4s, v29.4s[2]
137 fmla v7.4s, v21.4s, v29.4s[3]
138 cmp w8, #4
139
140 // unroll 2
141 fmla v0.4s, v26.4s, v30.4s[0]
142 ld1 {v24.4s, v25.4s}, [x9], #32
143 fmla v1.4s, v26.4s, v30.4s[1]
144 ld1 {v28.4s, v29.4s}, [x11], #32
145 fmla v2.4s, v26.4s, v30.4s[2]
146 ld1 {v20.4s, v21.4s}, [x12], #32
147 fmla v3.4s, v26.4s, v30.4s[3]
148 fmla v4.4s, v22.4s, v30.4s[0]
149 fmla v5.4s, v22.4s, v30.4s[1]
150 fmla v6.4s, v22.4s, v30.4s[2]
151 fmla v7.4s, v22.4s, v30.4s[3]
152
153 // unroll 3
154 fmla v0.4s, v27.4s, v31.4s[0]
155 fmla v1.4s, v27.4s, v31.4s[1]
156 fmla v2.4s, v27.4s, v31.4s[2]
157 fmla v3.4s, v27.4s, v31.4s[3]
158 fmla v4.4s, v23.4s, v31.4s[0]
159 fmla v5.4s, v23.4s, v31.4s[1]
160 fmla v6.4s, v23.4s, v31.4s[2]
161 fmla v7.4s, v23.4s, v31.4s[3]
162
163 bgt 1b
164
1650:
166
167 cmp w8, #3
168 ble 4f
169
170 // unroll 0
171 fmla v0.4s, v24.4s, v28.4s[0]
172 ld1 {v26.4s, v27.4s}, [x9], #32
173 fmla v1.4s, v24.4s, v28.4s[1]
174 ld1 {v30.4s, v31.4s}, [x11], #32
175 fmla v2.4s, v24.4s, v28.4s[2]
176 ld1 {v22.4s, v23.4s}, [x12], #32
177 fmla v3.4s, v24.4s, v28.4s[3]
178// prfm PLDL1KEEP, [x11, #64]
179 fmla v4.4s, v20.4s, v28.4s[0]
180// prfm PLDL1KEEP, [x9, #64]
181 fmla v5.4s, v20.4s, v28.4s[1]
182// prfm PLDL1KEEP, [x12, #64]
183 fmla v6.4s, v20.4s, v28.4s[2]
184 fmla v7.4s, v20.4s, v28.4s[3]
185 sub w8, w8, #4
186
187 // unroll 1
188 fmla v0.4s, v25.4s, v29.4s[0]
189 fmla v1.4s, v25.4s, v29.4s[1]
190 fmla v2.4s, v25.4s, v29.4s[2]
191 fmla v3.4s, v25.4s, v29.4s[3]
192 fmla v4.4s, v21.4s, v29.4s[0]
193 fmla v5.4s, v21.4s, v29.4s[1]
194 fmla v6.4s, v21.4s, v29.4s[2]
195 fmla v7.4s, v21.4s, v29.4s[3]
196// cmp w8, #4
197
198 // unroll 2
199 fmla v0.4s, v26.4s, v30.4s[0]
200// ld1 {v24.4s, v25.4s}, [x9], #32
201 fmla v1.4s, v26.4s, v30.4s[1]
202// ld1 {v28.4s, v29.4s}, [x11], #32
203 fmla v2.4s, v26.4s, v30.4s[2]
204// ld1 {v20.4s, v21.4s}, [x12], #32
205 fmla v3.4s, v26.4s, v30.4s[3]
206// ld1 {v16.4s, v17.4s}, [x13], #32
207 fmla v4.4s, v22.4s, v30.4s[0]
208 fmla v5.4s, v22.4s, v30.4s[1]
209 fmla v6.4s, v22.4s, v30.4s[2]
210 fmla v7.4s, v22.4s, v30.4s[3]
211
212 // unroll 3
213 fmla v0.4s, v27.4s, v31.4s[0]
214 fmla v1.4s, v27.4s, v31.4s[1]
215 fmla v2.4s, v27.4s, v31.4s[2]
216 fmla v3.4s, v27.4s, v31.4s[3]
217 fmla v4.4s, v23.4s, v31.4s[0]
218 fmla v5.4s, v23.4s, v31.4s[1]
219 fmla v6.4s, v23.4s, v31.4s[2]
220 fmla v7.4s, v23.4s, v31.4s[3]
221
222 b 2f // return
223
2244: // consider clean1-up loop
225
226 cmp w8, #0
227 ble 2f // return
228
229 sub x9, x9, #32
230 sub x12, x12, #32
231 sub x11, x11, #32
232
2333: // clean1-up loop
234
235 // unroll 0
236
237 ld1 {v28.4s}, [x11], #16
238 ld1 {v24.4s}, [x9], #16
239 fmla v0.4s, v24.4s, v28.4s[0]
240 fmla v1.4s, v24.4s, v28.4s[1]
241 fmla v2.4s, v24.4s, v28.4s[2]
242 fmla v3.4s, v24.4s, v28.4s[3]
243 ld1 {v20.4s}, [x12], #16
244 fmla v4.4s, v20.4s, v28.4s[0]
245 fmla v5.4s, v20.4s, v28.4s[1]
246 fmla v6.4s, v20.4s, v28.4s[2]
247 fmla v7.4s, v20.4s, v28.4s[3]
248
249 sub w8, w8, #1
250 cmp w8, #0
251 bgt 3b
252
2532: // return
254
255#if MACRO_LEVEL>=2
256 .endm
257#else
258 ret
259
260 .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
261#endif
262
263
264
265
266
267// subroutine
268//
269// input arguments:
270// x8 <- alpha
271// x9 <- beta
272// x10 <- C
273// x11 <- sdc
274//
275// output arguments:
276
277#if MACRO_LEVEL>=2
278 .macro INNER_SCALE_AB_8X4_LIB4
279#else
280 .align 4
281 .type inner_scale_ab_8x4_lib4, %function
282inner_scale_ab_8x4_lib4:
283#endif
284
285 ld1 {v28.4s}, [x8]
286
287 fmul v0.4s, v0.4s, v28.4s[0]
288 fmul v1.4s, v1.4s, v28.4s[0]
289 fmul v2.4s, v2.4s, v28.4s[0]
290 fmul v3.4s, v3.4s, v28.4s[0]
291 fmul v4.4s, v4.4s, v28.4s[0]
292 fmul v5.4s, v5.4s, v28.4s[0]
293 fmul v6.4s, v6.4s, v28.4s[0]
294 fmul v7.4s, v7.4s, v28.4s[0]
295
296 ld1 {v28.4s}, [x9]
297
298 add x12, x10, x11
299
300 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
301 fmla v0.4s, v24.4s, v28.4s[0]
302 fmla v1.4s, v25.4s, v28.4s[0]
303 fmla v2.4s, v26.4s, v28.4s[0]
304 fmla v3.4s, v27.4s, v28.4s[0]
305
306 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
307 fmla v4.4s, v24.4s, v28.4s[0]
308 fmla v5.4s, v25.4s, v28.4s[0]
309 fmla v6.4s, v26.4s, v28.4s[0]
310 fmla v7.4s, v27.4s, v28.4s[0]
311
312#if MACRO_LEVEL>=2
313 .endm
314#else
315 ret
316
317 .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
318#endif
319
320
321
322
323
324// subroutine
325//
326// input arguments:
327// x8 <- D
328// x9 <- sdd
329//
330// output arguments:
331
332#if MACRO_LEVEL>=2
333 .macro INNER_STORE_8X4_LIB4
334#else
335 .align 4
336 .type inner_store_8x4_lib4, %function
337inner_store_8x4_lib4:
338#endif
339
340 add x10, x8, x9
341
342 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
343 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
344
345#if MACRO_LEVEL>=2
346 .endm
347#else
348 ret
349
350 .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
351#endif
352
353
354
355
356
357// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
358// void kernel_sgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
359
360 .align 4
361 .global kernel_sgemm_nt_8x4_lib4
362 .type kernel_sgemm_nt_8x4_lib4, %function
363kernel_sgemm_nt_8x4_lib4:
364
365
366
367 PROLOGUE
368
369
370
371 // TODO zero the entire 128-bit register ???
372 fmov d0, xzr
373 fmov d1, d0
374 fmov d2, d0
375 fmov d3, d0
376 fmov d4, d0
377 fmov d5, d0
378 fmov d6, d0
379 fmov d7, d0
380
381
382
383 // call inner kernel gemm nt
384 mov w8, w0 // kmax
385 mov x9, x2 // A
386 mov w10, w3 // sda
387 lsl w10, w10, #4 // 16*sda
388 mov x11, x4 // B
389
390#if MACRO_LEVEL>=2
391 INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
392#else
393 bl inner_kernel_gemm_add_nt_8x4_lib4
394#endif
395
396
397
398 // call inner blend for generic alpha and beta
399 mov x8, x1 // alpha
400 mov x9, x5 // beta
401 mov x10, x6 // C
402 mov w11, w7 // C
403 lsl w11, w11, #4 // 16*sdc
404
405#if MACRO_LEVEL>=1
406 INNER_SCALE_AB_8X4_LIB4
407#else
408 bl inner_scale_ab_8x4_lib4
409#endif
410
411
412
413 // store n
414 ldr x8, [sp, #(STACKSIZE + 0)] // D
415 ldr w9, [sp, #(STACKSIZE + 8)] // sdd
416 lsl w9, w9, #4 // 16*sdd
417
418#if MACRO_LEVEL>=1
419 INNER_STORE_8X4_LIB4
420#else
421 bl inner_store_8x4_lib4
422#endif
423
424
425
426 EPILOGUE
427
428 mov x0, #0
429
430 ret
431
432
433