1/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#if defined(OS_LINUX) | defined(OS_MAC)
30
31//#define STACKSIZE 96
32#define STACKSIZE 64
33#define ARG1 %rdi
34#define ARG2 %rsi
35#define ARG3 %rdx
36#define ARG4 %rcx
37#define ARG5 %r8
38#define ARG6 %r9
39#define ARG7 STACKSIZE + 8(%rsp)
40#define ARG8 STACKSIZE + 16(%rsp)
41#define ARG9 STACKSIZE + 24(%rsp)
42#define ARG10 STACKSIZE + 32(%rsp)
43#define ARG11 STACKSIZE + 40(%rsp)
44#define ARG12 STACKSIZE + 48(%rsp)
45#define ARG13 STACKSIZE + 56(%rsp)
46#define ARG14 STACKSIZE + 64(%rsp)
47#define ARG15 STACKSIZE + 72(%rsp)
48#define ARG16 STACKSIZE + 80(%rsp)
49#define ARG17 STACKSIZE + 88(%rsp)
50#define ARG18 STACKSIZE + 96(%rsp)
51#define PROLOGUE \
52 subq $STACKSIZE, %rsp; \
53 movq %rbx, (%rsp); \
54 movq %rbp, 8(%rsp); \
55 movq %r12, 16(%rsp); \
56 movq %r13, 24(%rsp); \
57 movq %r14, 32(%rsp); \
58 movq %r15, 40(%rsp); \
59 vzeroupper;
60#define EPILOGUE \
61 vzeroupper; \
62 movq (%rsp), %rbx; \
63 movq 8(%rsp), %rbp; \
64 movq 16(%rsp), %r12; \
65 movq 24(%rsp), %r13; \
66 movq 32(%rsp), %r14; \
67 movq 40(%rsp), %r15; \
68 addq $STACKSIZE, %rsp;
69
70#elif defined(OS_WINDOWS)
71
72#define STACKSIZE 256
73#define ARG1 %rcx
74#define ARG2 %rdx
75#define ARG3 %r8
76#define ARG4 %r9
77#define ARG5 STACKSIZE + 40(%rsp)
78#define ARG6 STACKSIZE + 48(%rsp)
79#define ARG7 STACKSIZE + 56(%rsp)
80#define ARG8 STACKSIZE + 64(%rsp)
81#define ARG9 STACKSIZE + 72(%rsp)
82#define ARG10 STACKSIZE + 80(%rsp)
83#define ARG11 STACKSIZE + 88(%rsp)
84#define ARG12 STACKSIZE + 96(%rsp)
85#define ARG13 STACKSIZE + 104(%rsp)
86#define ARG14 STACKSIZE + 112(%rsp)
87#define ARG15 STACKSIZE + 120(%rsp)
88#define ARG16 STACKSIZE + 128(%rsp)
89#define ARG17 STACKSIZE + 136(%rsp)
90#define ARG18 STACKSIZE + 144(%rsp)
91#define PROLOGUE \
92 subq $STACKSIZE, %rsp; \
93 movq %rbx, (%rsp); \
94 movq %rbp, 8(%rsp); \
95 movq %r12, 16(%rsp); \
96 movq %r13, 24(%rsp); \
97 movq %r14, 32(%rsp); \
98 movq %r15, 40(%rsp); \
99 movq %rdi, 48(%rsp); \
100 movq %rsi, 56(%rsp); \
101 vmovups %xmm6, 64(%rsp); \
102 vmovups %xmm7, 80(%rsp); \
103 vmovups %xmm8, 96(%rsp); \
104 vmovups %xmm9, 112(%rsp); \
105 vmovups %xmm10, 128(%rsp); \
106 vmovups %xmm11, 144(%rsp); \
107 vmovups %xmm12, 160(%rsp); \
108 vmovups %xmm13, 176(%rsp); \
109 vmovups %xmm14, 192(%rsp); \
110 vmovups %xmm15, 208(%rsp); \
111 vzeroupper;
112#define EPILOGUE \
113 vzeroupper; \
114 movq (%rsp), %rbx; \
115 movq 8(%rsp), %rbp; \
116 movq 16(%rsp), %r12; \
117 movq 24(%rsp), %r13; \
118 movq 32(%rsp), %r14; \
119 movq 40(%rsp), %r15; \
120 movq 48(%rsp), %rdi; \
121 movq 56(%rsp), %rsi; \
122 vmovups 64(%rsp), %xmm6; \
123 vmovups 80(%rsp), %xmm7; \
124 vmovups 96(%rsp), %xmm8; \
125 vmovups 112(%rsp), %xmm9; \
126 vmovups 128(%rsp), %xmm10; \
127 vmovups 144(%rsp), %xmm11; \
128 vmovups 160(%rsp), %xmm12; \
129 vmovups 176(%rsp), %xmm13; \
130 vmovups 192(%rsp), %xmm14; \
131 vmovups 208(%rsp), %xmm15; \
132 addq $STACKSIZE, %rsp;
133
134#else
135
136#error wrong OS
137
138#endif
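// For reference, the ARGn macros above just name where the C-level arguments of a kernel land under
// the two supported calling conventions. A hedged sketch, taking a typical lib8 single-precision
// kernel as an example (the exact name and argument list are illustrative, not stated by this file):
//
//	void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B,
//	                              float *beta, float *C, float *D);
//
// System V (OS_LINUX / OS_MAC): k -> ARG1 (%rdi), alpha -> ARG2 (%rsi), A -> ARG3 (%rdx),
// B -> ARG4 (%rcx), beta -> ARG5 (%r8), C -> ARG6 (%r9), D -> ARG7, the first stack argument,
// found at STACKSIZE+8(%rsp) once the PROLOGUE has reserved its 64-byte save area.
// Win64 (OS_WINDOWS): k -> %rcx, alpha -> %rdx, A -> %r8, B -> %r9, and the remaining arguments
// sit on the stack above the 32-byte shadow space, which is why ARG5 starts at STACKSIZE+40(%rsp);
// the PROLOGUE additionally saves the callee-saved %rdi, %rsi and %xmm6-%xmm15.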
139
140
141
142#if defined(OS_LINUX) | defined(OS_WINDOWS)
143 .text
144#elif defined(OS_MAC)
145 .section __TEXT,__text,regular,pure_instructions
146#endif
147
148
149
150// common inner routine with file scope
151//
152// input arguments:
153// r10d <- k
154// r11 <- A
155// r12 <- B
156// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
157// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
158// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
159// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
160// ymm4 <- []
161// ymm5 <- []
162// ymm6 <- []
163// ymm7 <- []
164// ymm8 <- dirty
165// ymm9 <- dirty
166// ymm10 <- dirty
167// ymm11 <- dirty
168// ymm12 <- dirty
169// ymm13 <- dirty
170// ymm14 <- dirty
171// ymm15 <- dirty
172
173//
174// output arguments:
175// r10d <- 0
176// r11 <- A+8*k*sizeof(float)
177// r12 <- B+8*k*sizeof(float)
178// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
179// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
180// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
181// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
182// ymm4 <- []
183// ymm5 <- []
184// ymm6 <- []
185// ymm7 <- []
186// ymm8 <- dirty
187// ymm9 <- dirty
188// ymm10 <- dirty
189// ymm11 <- dirty
190// ymm12 <- dirty
191// ymm13 <- dirty
192// ymm14 <- dirty
193// ymm15 <- dirty
194
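// A rough C model of one k-iteration of this routine (illustrative reference only, not part of the
// build). A and B are 8 x k panels with 8 floats per column, and the NT kernel accumulates
// contributions to D = A * B^T. The shuffle constants 0xb1, 0x4e, 0xb1 cycle the broadcast half of
// B through the column orders 0, 1, 3, 2, which is exactly the scrambled accumulator layout listed
// above; de-scrambling back to plain column order happens outside this routine, before the result
// is scaled and stored.
//
//	static const int col_xor[4] = {0, 1, 3, 2}; // 0xb1 -> ^1, then 0x4e -> ^3, then 0xb1 -> ^2
//
//	// acc[r][i] models lane i of ymm(r); A and B point at the current 8-float columns.
//	static void nt_step_ref(const float *A, const float *B, float acc[8][8])
//	{
//		for (int r = 0; r < 4; r++)
//			for (int i = 0; i < 8; i++)
//				{
//				int c = (i % 4) ^ col_xor[r];
//				acc[r][i]     += A[i] * B[c];     // ymm0..ymm3: columns 0..3 of D
//				acc[4 + r][i] += A[i] * B[4 + c]; // ymm4..ymm7: columns 4..7 of D
//				}
//	}
//
// The main loop below applies this step four times per iteration, advancing A and B by 8 floats
// each time, and the clean-up loops handle the remaining iterations one column at a time.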
195#if MACRO_LEVEL>=2
196 .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
197#else
198 .p2align 4,,15
199#if defined(OS_LINUX)
200 .type inner_kernel_gemm_add_nt_8x8_lib8, @function
201inner_kernel_gemm_add_nt_8x8_lib8:
202#elif defined(OS_MAC)
203_inner_kernel_gemm_add_nt_8x8_lib8:
204#elif defined(OS_WINDOWS)
205 .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
206inner_kernel_gemm_add_nt_8x8_lib8:
207#endif
208#endif
209
210 cmpl $0, %r10d
211 jle 2f // return
212
213 // preload
214 vbroadcastf128 0(%r12), %ymm14 // B
215 vmovaps 0(%r11), %ymm12 // A
216 vbroadcastf128 16(%r12), %ymm15 // B
217 vmovaps 32(%r11), %ymm13 // A
218
219 cmpl $4, %r10d
220 jle 0f // consider clean-up loop
221
222 // main loop
223 .p2align 3
2241: // main loop
225
226 // unroll 0
227 vmulps %ymm12, %ymm14, %ymm11
228 vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
229 vaddps %ymm11, %ymm0, %ymm0
230
231 vmulps %ymm12, %ymm14, %ymm11
232 vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
233 vaddps %ymm11, %ymm1, %ymm1
234
235 vmulps %ymm12, %ymm14, %ymm11
236 vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
237 vaddps %ymm11, %ymm2, %ymm2
238
239 vmulps %ymm12, %ymm14, %ymm11
240 vbroadcastf128 32(%r12), %ymm14 // B
241 vaddps %ymm11, %ymm3, %ymm3
242
243 vmulps %ymm12, %ymm15, %ymm11
244 vshufps $0xb1, %ymm15, %ymm15, %ymm15
245 vaddps %ymm11, %ymm4, %ymm4
246
247 vmulps %ymm12, %ymm15, %ymm11
248 vshufps $0x4e, %ymm15, %ymm15, %ymm15
249 vaddps %ymm11, %ymm5, %ymm5
250
251 vmulps %ymm12, %ymm15, %ymm11
252 vshufps $0xb1, %ymm15, %ymm15, %ymm15
253 vaddps %ymm11, %ymm6, %ymm6
254
255 vmulps %ymm12, %ymm15, %ymm11
256 vbroadcastf128 48(%r12), %ymm15 // B
257 vaddps %ymm11, %ymm7, %ymm7
258 vmovaps 64(%r11), %ymm12 // A
259
260
261 // unroll 1
262 vmulps %ymm13, %ymm14, %ymm11
263 vshufps $0xb1, %ymm14, %ymm14, %ymm14
264 vaddps %ymm11, %ymm0, %ymm0
265
266 vmulps %ymm13, %ymm14, %ymm11
267 vshufps $0x4e, %ymm14, %ymm14, %ymm14
268 vaddps %ymm11, %ymm1, %ymm1
269
270 vmulps %ymm13, %ymm14, %ymm11
271 vshufps $0xb1, %ymm14, %ymm14, %ymm14
272 vaddps %ymm11, %ymm2, %ymm2
273
274 vmulps %ymm13, %ymm14, %ymm11
275 vbroadcastf128 64(%r12), %ymm14 // B
276 vaddps %ymm11, %ymm3, %ymm3
277
278 vmulps %ymm13, %ymm15, %ymm11
279 vshufps $0xb1, %ymm15, %ymm15, %ymm15
280 vaddps %ymm11, %ymm4, %ymm4
281
282 vmulps %ymm13, %ymm15, %ymm11
283 vshufps $0x4e, %ymm15, %ymm15, %ymm15
284 vaddps %ymm11, %ymm5, %ymm5
285
286 vmulps %ymm13, %ymm15, %ymm11
287 vshufps $0xb1, %ymm15, %ymm15, %ymm15
288 vaddps %ymm11, %ymm6, %ymm6
289
290 vmulps %ymm13, %ymm15, %ymm11
291 vbroadcastf128 80(%r12), %ymm15 // B
292 vaddps %ymm11, %ymm7, %ymm7
293 vmovaps 96(%r11), %ymm13 // A
294
295
296 // unroll 2
297 vmulps %ymm12, %ymm14, %ymm11
298 vshufps $0xb1, %ymm14, %ymm14, %ymm14
299 vaddps %ymm11, %ymm0, %ymm0
300
301 vmulps %ymm12, %ymm14, %ymm11
302 vshufps $0x4e, %ymm14, %ymm14, %ymm14
303 vaddps %ymm11, %ymm1, %ymm1
304
305 vmulps %ymm12, %ymm14, %ymm11
306 vshufps $0xb1, %ymm14, %ymm14, %ymm14
307 vaddps %ymm11, %ymm2, %ymm2
308
309 vmulps %ymm12, %ymm14, %ymm11
310 vbroadcastf128 96(%r12), %ymm14 // B
311 vaddps %ymm11, %ymm3, %ymm3
312
313 vmulps %ymm12, %ymm15, %ymm11
314 vshufps $0xb1, %ymm15, %ymm15, %ymm15
315 vaddps %ymm11, %ymm4, %ymm4
316
317 vmulps %ymm12, %ymm15, %ymm11
318 vshufps $0x4e, %ymm15, %ymm15, %ymm15
319 vaddps %ymm11, %ymm5, %ymm5
320
321 vmulps %ymm12, %ymm15, %ymm11
322 vshufps $0xb1, %ymm15, %ymm15, %ymm15
323 vaddps %ymm11, %ymm6, %ymm6
324
325 vmulps %ymm12, %ymm15, %ymm11
326 vbroadcastf128 112(%r12), %ymm15 // B
327 vaddps %ymm11, %ymm7, %ymm7
328 vmovaps 128(%r11), %ymm12 // A
329
330 subl $4, %r10d
331 addq $128, %r11
332 addq $128, %r12
333
334 // unroll 3
335 vmulps %ymm13, %ymm14, %ymm11
336 vshufps $0xb1, %ymm14, %ymm14, %ymm14
337 vaddps %ymm11, %ymm0, %ymm0
338
339 vmulps %ymm13, %ymm14, %ymm11
340 vshufps $0x4e, %ymm14, %ymm14, %ymm14
341 vaddps %ymm11, %ymm1, %ymm1
342
343 vmulps %ymm13, %ymm14, %ymm11
344 vshufps $0xb1, %ymm14, %ymm14, %ymm14
345 vaddps %ymm11, %ymm2, %ymm2
346
347 vmulps %ymm13, %ymm14, %ymm11
348 vbroadcastf128 0(%r12), %ymm14 // B
349 vaddps %ymm11, %ymm3, %ymm3
350
351 vmulps %ymm13, %ymm15, %ymm11
352 vshufps $0xb1, %ymm15, %ymm15, %ymm15
353 vaddps %ymm11, %ymm4, %ymm4
354
355 vmulps %ymm13, %ymm15, %ymm11
356 vshufps $0x4e, %ymm15, %ymm15, %ymm15
357 vaddps %ymm11, %ymm5, %ymm5
358
359 vmulps %ymm13, %ymm15, %ymm11
360 vshufps $0xb1, %ymm15, %ymm15, %ymm15
361 vaddps %ymm11, %ymm6, %ymm6
362
363 vmulps %ymm13, %ymm15, %ymm11
364 vbroadcastf128 16(%r12), %ymm15 // B
365 vaddps %ymm11, %ymm7, %ymm7
366 vmovaps 32(%r11), %ymm13 // A
367
368 cmpl $4, %r10d
369 jg 1b // main loop
370
371
3720: // consider clean4-up
373
374 cmpl $3, %r10d
375 jle 4f // clean1
376
377
378 // unroll 0
379 vmulps %ymm12, %ymm14, %ymm11
380 vshufps $0xb1, %ymm14, %ymm14, %ymm14
381 vaddps %ymm11, %ymm0, %ymm0
382
383 vmulps %ymm12, %ymm14, %ymm11
384 vshufps $0x4e, %ymm14, %ymm14, %ymm14
385 vaddps %ymm11, %ymm1, %ymm1
386
387 vmulps %ymm12, %ymm14, %ymm11
388 vshufps $0xb1, %ymm14, %ymm14, %ymm14
389 vaddps %ymm11, %ymm2, %ymm2
390
391 vmulps %ymm12, %ymm14, %ymm11
392 vbroadcastf128 32(%r12), %ymm14 // B
393 vaddps %ymm11, %ymm3, %ymm3
394
395 vmulps %ymm12, %ymm15, %ymm11
396 vshufps $0xb1, %ymm15, %ymm15, %ymm15
397 vaddps %ymm11, %ymm4, %ymm4
398
399 vmulps %ymm12, %ymm15, %ymm11
400 vshufps $0x4e, %ymm15, %ymm15, %ymm15
401 vaddps %ymm11, %ymm5, %ymm5
402
403 vmulps %ymm12, %ymm15, %ymm11
404 vshufps $0xb1, %ymm15, %ymm15, %ymm15
405 vaddps %ymm11, %ymm6, %ymm6
406
407 vmulps %ymm12, %ymm15, %ymm11
408 vbroadcastf128 48(%r12), %ymm15 // B
409 vaddps %ymm11, %ymm7, %ymm7
410 vmovaps 64(%r11), %ymm12 // A
411
412
413 // unroll 1
414 vmulps %ymm13, %ymm14, %ymm11
415 vshufps $0xb1, %ymm14, %ymm14, %ymm14
416 vaddps %ymm11, %ymm0, %ymm0
417
418 vmulps %ymm13, %ymm14, %ymm11
419 vshufps $0x4e, %ymm14, %ymm14, %ymm14
420 vaddps %ymm11, %ymm1, %ymm1
421
422 vmulps %ymm13, %ymm14, %ymm11
423 vshufps $0xb1, %ymm14, %ymm14, %ymm14
424 vaddps %ymm11, %ymm2, %ymm2
425
426 vmulps %ymm13, %ymm14, %ymm11
427 vbroadcastf128 64(%r12), %ymm14 // B
428 vaddps %ymm11, %ymm3, %ymm3
429
430 vmulps %ymm13, %ymm15, %ymm11
431 vshufps $0xb1, %ymm15, %ymm15, %ymm15
432 vaddps %ymm11, %ymm4, %ymm4
433
434 vmulps %ymm13, %ymm15, %ymm11
435 vshufps $0x4e, %ymm15, %ymm15, %ymm15
436 vaddps %ymm11, %ymm5, %ymm5
437
438 vmulps %ymm13, %ymm15, %ymm11
439 vshufps $0xb1, %ymm15, %ymm15, %ymm15
440 vaddps %ymm11, %ymm6, %ymm6
441
442 vmulps %ymm13, %ymm15, %ymm11
443 vbroadcastf128 80(%r12), %ymm15 // B
444 vaddps %ymm11, %ymm7, %ymm7
445 vmovaps 96(%r11), %ymm13 // A
446
447
448 // unroll 2
449 vmulps %ymm12, %ymm14, %ymm11
450 vshufps $0xb1, %ymm14, %ymm14, %ymm14
451 vaddps %ymm11, %ymm0, %ymm0
452
453 vmulps %ymm12, %ymm14, %ymm11
454 vshufps $0x4e, %ymm14, %ymm14, %ymm14
455 vaddps %ymm11, %ymm1, %ymm1
456
457 vmulps %ymm12, %ymm14, %ymm11
458 vshufps $0xb1, %ymm14, %ymm14, %ymm14
459 vaddps %ymm11, %ymm2, %ymm2
460
461 vmulps %ymm12, %ymm14, %ymm11
462 vbroadcastf128 96(%r12), %ymm14 // B
463 vaddps %ymm11, %ymm3, %ymm3
464
465 vmulps %ymm12, %ymm15, %ymm11
466 vshufps $0xb1, %ymm15, %ymm15, %ymm15
467 vaddps %ymm11, %ymm4, %ymm4
468
469 vmulps %ymm12, %ymm15, %ymm11
470 vshufps $0x4e, %ymm15, %ymm15, %ymm15
471 vaddps %ymm11, %ymm5, %ymm5
472
473 vmulps %ymm12, %ymm15, %ymm11
474 vshufps $0xb1, %ymm15, %ymm15, %ymm15
475 vaddps %ymm11, %ymm6, %ymm6
476
477 vmulps %ymm12, %ymm15, %ymm11
478 vbroadcastf128 112(%r12), %ymm15 // B
479 vaddps %ymm11, %ymm7, %ymm7
480// vmovaps 128(%r11), %ymm12 // A
481
482 subl $4, %r10d
483 addq $128, %r11
484 addq $128, %r12
485
486 // unroll 3
487 vmulps %ymm13, %ymm14, %ymm11
488 vshufps $0xb1, %ymm14, %ymm14, %ymm14
489 vaddps %ymm11, %ymm0, %ymm0
490
491 vmulps %ymm13, %ymm14, %ymm11
492 vshufps $0x4e, %ymm14, %ymm14, %ymm14
493 vaddps %ymm11, %ymm1, %ymm1
494
495 vmulps %ymm13, %ymm14, %ymm11
496 vshufps $0xb1, %ymm14, %ymm14, %ymm14
497 vaddps %ymm11, %ymm2, %ymm2
498
499 vmulps %ymm13, %ymm14, %ymm11
500// vbroadcastf128 0(%r12), %ymm14 // B
501 vaddps %ymm11, %ymm3, %ymm3
502
503 vmulps %ymm13, %ymm15, %ymm11
504 vshufps $0xb1, %ymm15, %ymm15, %ymm15
505 vaddps %ymm11, %ymm4, %ymm4
506
507 vmulps %ymm13, %ymm15, %ymm11
508 vshufps $0x4e, %ymm15, %ymm15, %ymm15
509 vaddps %ymm11, %ymm5, %ymm5
510
511 vmulps %ymm13, %ymm15, %ymm11
512 vshufps $0xb1, %ymm15, %ymm15, %ymm15
513 vaddps %ymm11, %ymm6, %ymm6
514
515 vmulps %ymm13, %ymm15, %ymm11
516// vbroadcastf128 16(%r12), %ymm15 // B
517 vaddps %ymm11, %ymm7, %ymm7
518// vmovaps 32(%r11), %ymm13 // A
519
520
521// cmpl $4, %r10d
522 jmp 2f // return
523
524
5254: // consider clean1-up loop
526
527 cmpl $0, %r10d
528 jle 2f // return
529
530 // clean-up loop
5313: // clean up loop
532
533 // unroll 0
534 vbroadcastf128 0(%r12), %ymm14 // B
535 vmovaps 0(%r11), %ymm12 // A
536 vmulps %ymm12, %ymm14, %ymm11
537 vaddps %ymm11, %ymm0, %ymm0
538
539 vshufps $0xb1, %ymm14, %ymm14, %ymm14
540 vmulps %ymm12, %ymm14, %ymm11
541 vaddps %ymm11, %ymm1, %ymm1
542
543 vshufps $0x4e, %ymm14, %ymm14, %ymm14
544 vmulps %ymm12, %ymm14, %ymm11
545 vaddps %ymm11, %ymm2, %ymm2
546
547 vshufps $0xb1, %ymm14, %ymm14, %ymm14
548 vmulps %ymm12, %ymm14, %ymm11
549 vaddps %ymm11, %ymm3, %ymm3
550
551 vbroadcastf128 16(%r12), %ymm14 // B
552 vmulps %ymm12, %ymm14, %ymm11
553 vaddps %ymm11, %ymm4, %ymm4
554
555 vshufps $0xb1, %ymm14, %ymm14, %ymm14
556 vmulps %ymm12, %ymm14, %ymm11
557 vaddps %ymm11, %ymm5, %ymm5
558
559 vshufps $0x4e, %ymm14, %ymm14, %ymm14
560 vmulps %ymm12, %ymm14, %ymm11
561 vaddps %ymm11, %ymm6, %ymm6
562
563 subl $1, %r10d
564 addq $32, %r11
565 addq $32, %r12
566
567 vshufps $0xb1, %ymm14, %ymm14, %ymm14
568 vmulps %ymm12, %ymm14, %ymm11
569 vaddps %ymm11, %ymm7, %ymm7
570
571 cmpl $0, %r10d
572 jg 3b // clean up loop
573
574
5752: // return
576
577#if MACRO_LEVEL>=2
578 .endm
579#else
580 ret
581
582#if defined(OS_LINUX)
583 .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
584#endif
585#endif
586
587
588
589
590
591// common inner routine with file scope
592//
593// input arguments:
594// r10d <- k
595// r11 <- A
596// r12 <- B
597// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
598// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
599// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
600// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
601// ymm4 <- []
602// ymm5 <- []
603// ymm6 <- []
604// ymm7 <- []
605// ymm8 <- dirty
606// ymm9 <- dirty
607// ymm10 <- dirty
608// ymm11 <- dirty
609// ymm12 <- dirty
610// ymm13 <- dirty
611// ymm14 <- dirty
612// ymm15 <- dirty
613
614//
615// output arguments:
616// r10d <- 0
617// r11 <- A+8*k*sizeof(float)
618// r12 <- B+8*k*sizeof(float)
619// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
620// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
621// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
622// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
623// ymm4 <- []
624// ymm5 <- []
625// ymm6 <- []
626// ymm7 <- []
627// ymm8 <- dirty
628// ymm9 <- dirty
629// ymm10 <- dirty
630// ymm11 <- dirty
631// ymm12 <- dirty
632// ymm13 <- dirty
633// ymm14 <- dirty
634// ymm15 <- dirty
635
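// The SUB variant below is the same kernel as INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 above with every
// vaddps turned into a vsubps; in terms of the C model given there the updates become
//
//	acc[r][i]     -= A[i] * B[c];
//	acc[4 + r][i] -= A[i] * B[4 + c];
//
// so the caller accumulates D -= A * B^T, the form needed e.g. ahead of the trsm and potrf edge
// routines further down in this file.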
636#if MACRO_LEVEL>=2
637 .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
638#else
639 .p2align 4,,15
640#if defined(OS_LINUX)
641 .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
642inner_kernel_gemm_sub_nt_8x8_lib8:
643#elif defined(OS_MAC)
644_inner_kernel_gemm_sub_nt_8x8_lib8:
645#elif defined(OS_WINDOWS)
646 .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
647inner_kernel_gemm_sub_nt_8x8_lib8:
648#endif
649#endif
650
651 cmpl $0, %r10d
652 jle 2f // return
653
654 // preload
655 vbroadcastf128 0(%r12), %ymm14 // B
656 vmovaps 0(%r11), %ymm12 // A
657 vbroadcastf128 16(%r12), %ymm15 // B
658 vmovaps 32(%r11), %ymm13 // A
659
660 cmpl $4, %r10d
661 jle 0f // consider clean-up loop
662
663 // main loop
664 .p2align 3
6651: // main loop
666
667 // unroll 0
668 vmulps %ymm12, %ymm14, %ymm11
669 vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
670 vsubps %ymm11, %ymm0, %ymm0
671
672 vmulps %ymm12, %ymm14, %ymm11
673 vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
674 vsubps %ymm11, %ymm1, %ymm1
675
676 vmulps %ymm12, %ymm14, %ymm11
677 vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
678 vsubps %ymm11, %ymm2, %ymm2
679
680 vmulps %ymm12, %ymm14, %ymm11
681 vbroadcastf128 32(%r12), %ymm14 // B
682 vsubps %ymm11, %ymm3, %ymm3
683
684 vmulps %ymm12, %ymm15, %ymm11
685 vshufps $0xb1, %ymm15, %ymm15, %ymm15
686 vsubps %ymm11, %ymm4, %ymm4
687
688 vmulps %ymm12, %ymm15, %ymm11
689 vshufps $0x4e, %ymm15, %ymm15, %ymm15
690 vsubps %ymm11, %ymm5, %ymm5
691
692 vmulps %ymm12, %ymm15, %ymm11
693 vshufps $0xb1, %ymm15, %ymm15, %ymm15
694 vsubps %ymm11, %ymm6, %ymm6
695
696 vmulps %ymm12, %ymm15, %ymm11
697 vbroadcastf128 48(%r12), %ymm15 // B
698 vsubps %ymm11, %ymm7, %ymm7
699 vmovaps 64(%r11), %ymm12 // A
700
701
702 // unroll 1
703 vmulps %ymm13, %ymm14, %ymm11
704 vshufps $0xb1, %ymm14, %ymm14, %ymm14
705 vsubps %ymm11, %ymm0, %ymm0
706
707 vmulps %ymm13, %ymm14, %ymm11
708 vshufps $0x4e, %ymm14, %ymm14, %ymm14
709 vsubps %ymm11, %ymm1, %ymm1
710
711 vmulps %ymm13, %ymm14, %ymm11
712 vshufps $0xb1, %ymm14, %ymm14, %ymm14
713 vsubps %ymm11, %ymm2, %ymm2
714
715 vmulps %ymm13, %ymm14, %ymm11
716 vbroadcastf128 64(%r12), %ymm14 // B
717 vsubps %ymm11, %ymm3, %ymm3
718
719 vmulps %ymm13, %ymm15, %ymm11
720 vshufps $0xb1, %ymm15, %ymm15, %ymm15
721 vsubps %ymm11, %ymm4, %ymm4
722
723 vmulps %ymm13, %ymm15, %ymm11
724 vshufps $0x4e, %ymm15, %ymm15, %ymm15
725 vsubps %ymm11, %ymm5, %ymm5
726
727 vmulps %ymm13, %ymm15, %ymm11
728 vshufps $0xb1, %ymm15, %ymm15, %ymm15
729 vsubps %ymm11, %ymm6, %ymm6
730
731 vmulps %ymm13, %ymm15, %ymm11
732 vbroadcastf128 80(%r12), %ymm15 // B
733 vsubps %ymm11, %ymm7, %ymm7
734 vmovaps 96(%r11), %ymm13 // A
735
736
737 // unroll 2
738 vmulps %ymm12, %ymm14, %ymm11
739 vshufps $0xb1, %ymm14, %ymm14, %ymm14
740 vsubps %ymm11, %ymm0, %ymm0
741
742 vmulps %ymm12, %ymm14, %ymm11
743 vshufps $0x4e, %ymm14, %ymm14, %ymm14
744 vsubps %ymm11, %ymm1, %ymm1
745
746 vmulps %ymm12, %ymm14, %ymm11
747 vshufps $0xb1, %ymm14, %ymm14, %ymm14
748 vsubps %ymm11, %ymm2, %ymm2
749
750 vmulps %ymm12, %ymm14, %ymm11
751 vbroadcastf128 96(%r12), %ymm14 // B
752 vsubps %ymm11, %ymm3, %ymm3
753
754 vmulps %ymm12, %ymm15, %ymm11
755 vshufps $0xb1, %ymm15, %ymm15, %ymm15
756 vsubps %ymm11, %ymm4, %ymm4
757
758 vmulps %ymm12, %ymm15, %ymm11
759 vshufps $0x4e, %ymm15, %ymm15, %ymm15
760 vsubps %ymm11, %ymm5, %ymm5
761
762 vmulps %ymm12, %ymm15, %ymm11
763 vshufps $0xb1, %ymm15, %ymm15, %ymm15
764 vsubps %ymm11, %ymm6, %ymm6
765
766 vmulps %ymm12, %ymm15, %ymm11
767 vbroadcastf128 112(%r12), %ymm15 // B
768 vsubps %ymm11, %ymm7, %ymm7
769 vmovaps 128(%r11), %ymm12 // A
770
771 subl $4, %r10d
772 addq $128, %r11
773 addq $128, %r12
774
775 // unroll 3
776 vmulps %ymm13, %ymm14, %ymm11
777 vshufps $0xb1, %ymm14, %ymm14, %ymm14
778 vsubps %ymm11, %ymm0, %ymm0
779
780 vmulps %ymm13, %ymm14, %ymm11
781 vshufps $0x4e, %ymm14, %ymm14, %ymm14
782 vsubps %ymm11, %ymm1, %ymm1
783
784 vmulps %ymm13, %ymm14, %ymm11
785 vshufps $0xb1, %ymm14, %ymm14, %ymm14
786 vsubps %ymm11, %ymm2, %ymm2
787
788 vmulps %ymm13, %ymm14, %ymm11
789 vbroadcastf128 0(%r12), %ymm14 // B
790 vsubps %ymm11, %ymm3, %ymm3
791
792 vmulps %ymm13, %ymm15, %ymm11
793 vshufps $0xb1, %ymm15, %ymm15, %ymm15
794 vsubps %ymm11, %ymm4, %ymm4
795
796 vmulps %ymm13, %ymm15, %ymm11
797 vshufps $0x4e, %ymm15, %ymm15, %ymm15
798 vsubps %ymm11, %ymm5, %ymm5
799
800 vmulps %ymm13, %ymm15, %ymm11
801 vshufps $0xb1, %ymm15, %ymm15, %ymm15
802 vsubps %ymm11, %ymm6, %ymm6
803
804 vmulps %ymm13, %ymm15, %ymm11
805 vbroadcastf128 16(%r12), %ymm15 // B
806 vsubps %ymm11, %ymm7, %ymm7
807 vmovaps 32(%r11), %ymm13 // A
808
809 cmpl $4, %r10d
810 jg 1b // main loop
811
812
8130: // consider clean4-up
814
815 cmpl $3, %r10d
816 jle 4f // clean1
817
818
819 // unroll 0
820 vmulps %ymm12, %ymm14, %ymm11
821 vshufps $0xb1, %ymm14, %ymm14, %ymm14
822 vsubps %ymm11, %ymm0, %ymm0
823
824 vmulps %ymm12, %ymm14, %ymm11
825 vshufps $0x4e, %ymm14, %ymm14, %ymm14
826 vsubps %ymm11, %ymm1, %ymm1
827
828 vmulps %ymm12, %ymm14, %ymm11
829 vshufps $0xb1, %ymm14, %ymm14, %ymm14
830 vsubps %ymm11, %ymm2, %ymm2
831
832 vmulps %ymm12, %ymm14, %ymm11
833 vbroadcastf128 32(%r12), %ymm14 // B
834 vsubps %ymm11, %ymm3, %ymm3
835
836 vmulps %ymm12, %ymm15, %ymm11
837 vshufps $0xb1, %ymm15, %ymm15, %ymm15
838 vsubps %ymm11, %ymm4, %ymm4
839
840 vmulps %ymm12, %ymm15, %ymm11
841 vshufps $0x4e, %ymm15, %ymm15, %ymm15
842 vsubps %ymm11, %ymm5, %ymm5
843
844 vmulps %ymm12, %ymm15, %ymm11
845 vshufps $0xb1, %ymm15, %ymm15, %ymm15
846 vsubps %ymm11, %ymm6, %ymm6
847
848 vmulps %ymm12, %ymm15, %ymm11
849 vbroadcastf128 48(%r12), %ymm15 // B
850 vsubps %ymm11, %ymm7, %ymm7
851 vmovaps 64(%r11), %ymm12 // A
852
853
854 // unroll 1
855 vmulps %ymm13, %ymm14, %ymm11
856 vshufps $0xb1, %ymm14, %ymm14, %ymm14
857 vsubps %ymm11, %ymm0, %ymm0
858
859 vmulps %ymm13, %ymm14, %ymm11
860 vshufps $0x4e, %ymm14, %ymm14, %ymm14
861 vsubps %ymm11, %ymm1, %ymm1
862
863 vmulps %ymm13, %ymm14, %ymm11
864 vshufps $0xb1, %ymm14, %ymm14, %ymm14
865 vsubps %ymm11, %ymm2, %ymm2
866
867 vmulps %ymm13, %ymm14, %ymm11
868 vbroadcastf128 64(%r12), %ymm14 // B
869 vsubps %ymm11, %ymm3, %ymm3
870
871 vmulps %ymm13, %ymm15, %ymm11
872 vshufps $0xb1, %ymm15, %ymm15, %ymm15
873 vsubps %ymm11, %ymm4, %ymm4
874
875 vmulps %ymm13, %ymm15, %ymm11
876 vshufps $0x4e, %ymm15, %ymm15, %ymm15
877 vsubps %ymm11, %ymm5, %ymm5
878
879 vmulps %ymm13, %ymm15, %ymm11
880 vshufps $0xb1, %ymm15, %ymm15, %ymm15
881 vsubps %ymm11, %ymm6, %ymm6
882
883 vmulps %ymm13, %ymm15, %ymm11
884 vbroadcastf128 80(%r12), %ymm15 // B
885 vsubps %ymm11, %ymm7, %ymm7
886 vmovaps 96(%r11), %ymm13 // A
887
888
889 // unroll 2
890 vmulps %ymm12, %ymm14, %ymm11
891 vshufps $0xb1, %ymm14, %ymm14, %ymm14
892 vsubps %ymm11, %ymm0, %ymm0
893
894 vmulps %ymm12, %ymm14, %ymm11
895 vshufps $0x4e, %ymm14, %ymm14, %ymm14
896 vsubps %ymm11, %ymm1, %ymm1
897
898 vmulps %ymm12, %ymm14, %ymm11
899 vshufps $0xb1, %ymm14, %ymm14, %ymm14
900 vsubps %ymm11, %ymm2, %ymm2
901
902 vmulps %ymm12, %ymm14, %ymm11
903 vbroadcastf128 96(%r12), %ymm14 // B
904 vsubps %ymm11, %ymm3, %ymm3
905
906 vmulps %ymm12, %ymm15, %ymm11
907 vshufps $0xb1, %ymm15, %ymm15, %ymm15
908 vsubps %ymm11, %ymm4, %ymm4
909
910 vmulps %ymm12, %ymm15, %ymm11
911 vshufps $0x4e, %ymm15, %ymm15, %ymm15
912 vsubps %ymm11, %ymm5, %ymm5
913
914 vmulps %ymm12, %ymm15, %ymm11
915 vshufps $0xb1, %ymm15, %ymm15, %ymm15
916 vsubps %ymm11, %ymm6, %ymm6
917
918 vmulps %ymm12, %ymm15, %ymm11
919 vbroadcastf128 112(%r12), %ymm15 // B
920 vsubps %ymm11, %ymm7, %ymm7
921// vmovaps 128(%r11), %ymm12 // A
922
923 subl $4, %r10d
924 addq $128, %r11
925 addq $128, %r12
926
927 // unroll 3
928 vmulps %ymm13, %ymm14, %ymm11
929 vshufps $0xb1, %ymm14, %ymm14, %ymm14
930 vsubps %ymm11, %ymm0, %ymm0
931
932 vmulps %ymm13, %ymm14, %ymm11
933 vshufps $0x4e, %ymm14, %ymm14, %ymm14
934 vsubps %ymm11, %ymm1, %ymm1
935
936 vmulps %ymm13, %ymm14, %ymm11
937 vshufps $0xb1, %ymm14, %ymm14, %ymm14
938 vsubps %ymm11, %ymm2, %ymm2
939
940 vmulps %ymm13, %ymm14, %ymm11
941// vbroadcastf128 0(%r12), %ymm14 // B
942 vsubps %ymm11, %ymm3, %ymm3
943
944 vmulps %ymm13, %ymm15, %ymm11
945 vshufps $0xb1, %ymm15, %ymm15, %ymm15
946 vsubps %ymm11, %ymm4, %ymm4
947
948 vmulps %ymm13, %ymm15, %ymm11
949 vshufps $0x4e, %ymm15, %ymm15, %ymm15
950 vsubps %ymm11, %ymm5, %ymm5
951
952 vmulps %ymm13, %ymm15, %ymm11
953 vshufps $0xb1, %ymm15, %ymm15, %ymm15
954 vsubps %ymm11, %ymm6, %ymm6
955
956 vmulps %ymm13, %ymm15, %ymm11
957// vbroadcastf128 16(%r12), %ymm15 // B
958 vsubps %ymm11, %ymm7, %ymm7
959// vmovaps 32(%r11), %ymm13 // A
960
961
962// cmpl $4, %r10d
963 jmp 2f // return
964
965
9664: // consider clean1-up loop
967
968 cmpl $0, %r10d
969 jle 2f // return
970
971 // clean-up loop
9723: // clean up loop
973
974 // unroll 0
975 vbroadcastf128 0(%r12), %ymm14 // B
976 vmovaps 0(%r11), %ymm12 // A
977 vmulps %ymm12, %ymm14, %ymm11
978 vsubps %ymm11, %ymm0, %ymm0
979
980 vshufps $0xb1, %ymm14, %ymm14, %ymm14
981 vmulps %ymm12, %ymm14, %ymm11
982 vsubps %ymm11, %ymm1, %ymm1
983
984 vshufps $0x4e, %ymm14, %ymm14, %ymm14
985 vmulps %ymm12, %ymm14, %ymm11
986 vsubps %ymm11, %ymm2, %ymm2
987
988 vshufps $0xb1, %ymm14, %ymm14, %ymm14
989 vmulps %ymm12, %ymm14, %ymm11
990 vsubps %ymm11, %ymm3, %ymm3
991
992 vbroadcastf128 16(%r12), %ymm14 // B
993 vmulps %ymm12, %ymm14, %ymm11
994 vsubps %ymm11, %ymm4, %ymm4
995
996 vshufps $0xb1, %ymm14, %ymm14, %ymm14
997 vmulps %ymm12, %ymm14, %ymm11
998 vsubps %ymm11, %ymm5, %ymm5
999
1000 vshufps $0x4e, %ymm14, %ymm14, %ymm14
1001 vmulps %ymm12, %ymm14, %ymm11
1002 vsubps %ymm11, %ymm6, %ymm6
1003
1004 subl $1, %r10d
1005 addq $32, %r11
1006 addq $32, %r12
1007
1008 vshufps $0xb1, %ymm14, %ymm14, %ymm14
1009 vmulps %ymm12, %ymm14, %ymm11
1010 vsubps %ymm11, %ymm7, %ymm7
1011
1012 cmpl $0, %r10d
1013 jg 3b // clean up loop
1014
1015
10162: // return
1017
1018#if MACRO_LEVEL>=2
1019 .endm
1020#else
1021 ret
1022
1023#if defined(OS_LINUX)
1024 .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
1025#endif
1026#endif
1027
1028
1029
1030
1031
1032// common inner routine with file scope
1033//
1034// input arguments:
1035// r10d <- k
1036// r11 <- A
1037// r12 <- B
1038// r13 <- 8*sdb*sizeof(float)
1039// r14 <- dirty
1040// ymm0 <- []
1041// ymm1 <- []
1042// ymm2 <- []
1043// ymm3 <- []
1044// ymm8 <- dirty
1045// ymm9 <- dirty
1046// ymm10 <- dirty
1047// ymm11 <- dirty
1048// ymm12 <- dirty
1049// ymm13 <- dirty
1050// ymm14 <- dirty
1051// ymm15 <- dirty
1052
1053//
1054// output arguments:
1055// r10d <- 0
1056// r11 <- A+8*k*sizeof(float)
1057// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
1058// r13 <- 8*sdb*sizeof(float)
1059// r14 <- dirty
1060// ymm0 <- []
1061// ymm1 <- []
1062// ymm2 <- []
1063// ymm3 <- []
1064// ymm8 <- dirty
1065// ymm9 <- dirty
1066// ymm10 <- dirty
1067// ymm11 <- dirty
1068// ymm12 <- dirty
1069// ymm13 <- dirty
1070// ymm14 <- dirty
1071// ymm15 <- dirty
1072
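// A rough C model of this NN routine (illustrative reference only, not part of the build).
// A is an 8 x k panel with 8 floats per column; B is stored k x 8 in row-panels of 8, so element
// (kk, j) of the current panel sits at byte offset 4*(kk%8) + 32*j and r13 carries the byte
// distance to the next panel (which the prefetcht0 instructions walk ahead of time via r14).
//
//	static void nn_ref(int k, const float *A, const float *B, int panel_stride /* in floats */,
//	                   float acc[8][8])
//	{
//		for (int kk = 0; kk < k; kk++)
//			{
//			const float *Bk = B + (kk / 8) * panel_stride + kk % 8; // row kk of B
//			for (int j = 0; j < 8; j++)      // one vbroadcastss per column of B
//				for (int i = 0; i < 8; i++)  // one 8-lane column of A per vmovaps
//					acc[j][i] += A[8 * kk + i] * Bk[8 * j];
//			}
//	}
//
// Unlike the NT kernel above, acc[j] here is simply column j of D in plain order.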
1073#if MACRO_LEVEL>=2
1074 .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
1075#else
1076 .p2align 4,,15
1077#if defined(OS_LINUX)
1078 .type inner_kernel_gemm_add_nn_8x8_lib8, @function
1079inner_kernel_gemm_add_nn_8x8_lib8:
1080#elif defined(OS_MAC)
1081_inner_kernel_gemm_add_nn_8x8_lib8:
1082#elif defined(OS_WINDOWS)
1083 .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
1084inner_kernel_gemm_add_nn_8x8_lib8:
1085#endif
1086#endif
1087
1088 cmpl $0, %r10d
1089 jle 2f // return
1090
1091 movq %r12, %r14 // B_next <- B
1092	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
1093
1094 cmpl $8, %r10d
1095 jl 0f // consider clean-up loop
1096
1097 // main loop
1098 .p2align 3
10991: // main loop
1100
1101 prefetcht0 0(%r14) // software prefetch
1102 prefetcht0 64(%r14) // software prefetch
1103 prefetcht0 128(%r14) // software prefetch
1104 prefetcht0 192(%r14) // software prefetch
1105
1106 // unroll 0
1107 vmovaps 0(%r11), %ymm12 // A[0]
1108 vbroadcastss 0(%r12), %ymm13 // B[0]
1109 vmulps %ymm12, %ymm13, %ymm15
1110 vaddps %ymm15, %ymm0, %ymm0
1111 vbroadcastss 32(%r12), %ymm13 // B[1]
1112 vmulps %ymm12, %ymm13, %ymm15
1113 vaddps %ymm15, %ymm1, %ymm1
1114 vbroadcastss 64(%r12), %ymm13 // B[2]
1115 vmulps %ymm12, %ymm13, %ymm15
1116 vaddps %ymm15, %ymm2, %ymm2
1117 vbroadcastss 96(%r12), %ymm13 // B[3]
1118 vmulps %ymm12, %ymm13, %ymm15
1119 vaddps %ymm15, %ymm3, %ymm3
1120 vbroadcastss 128(%r12), %ymm13 // B[4]
1121 vmulps %ymm12, %ymm13, %ymm15
1122 vaddps %ymm15, %ymm4, %ymm4
1123 vbroadcastss 160(%r12), %ymm13 // B[5]
1124 vmulps %ymm12, %ymm13, %ymm15
1125 vaddps %ymm15, %ymm5, %ymm5
1126 vbroadcastss 192(%r12), %ymm13 // B[6]
1127 vmulps %ymm12, %ymm13, %ymm15
1128 vaddps %ymm15, %ymm6, %ymm6
1129 vbroadcastss 224(%r12), %ymm13 // B[7]
1130 vmulps %ymm12, %ymm13, %ymm15
1131 vaddps %ymm15, %ymm7, %ymm7
1132
1133
1134 // unroll 1
1135 vmovaps 32(%r11), %ymm12 // A[0]
1136 vbroadcastss 4(%r12), %ymm13 // B[0]
1137 vmulps %ymm12, %ymm13, %ymm15
1138 vaddps %ymm15, %ymm0, %ymm0
1139 vbroadcastss 36(%r12), %ymm13 // B[1]
1140 vmulps %ymm12, %ymm13, %ymm15
1141 vaddps %ymm15, %ymm1, %ymm1
1142 vbroadcastss 68(%r12), %ymm13 // B[2]
1143 vmulps %ymm12, %ymm13, %ymm15
1144 vaddps %ymm15, %ymm2, %ymm2
1145 vbroadcastss 100(%r12), %ymm13 // B[3]
1146 vmulps %ymm12, %ymm13, %ymm15
1147 vaddps %ymm15, %ymm3, %ymm3
1148 vbroadcastss 132(%r12), %ymm13 // B[4]
1149 vmulps %ymm12, %ymm13, %ymm15
1150 vaddps %ymm15, %ymm4, %ymm4
1151 vbroadcastss 164(%r12), %ymm13 // B[5]
1152 vmulps %ymm12, %ymm13, %ymm15
1153 vaddps %ymm15, %ymm5, %ymm5
1154 vbroadcastss 196(%r12), %ymm13 // B[6]
1155 vmulps %ymm12, %ymm13, %ymm15
1156 vaddps %ymm15, %ymm6, %ymm6
1157 vbroadcastss 228(%r12), %ymm13 // B[7]
1158 vmulps %ymm12, %ymm13, %ymm15
1159 vaddps %ymm15, %ymm7, %ymm7
1160
1161
1162 // unroll 2
1163 vmovaps 64(%r11), %ymm12 // A[0]
1164 vbroadcastss 8(%r12), %ymm13 // B[0]
1165 vmulps %ymm12, %ymm13, %ymm15
1166 vaddps %ymm15, %ymm0, %ymm0
1167 vbroadcastss 40(%r12), %ymm13 // B[1]
1168 vmulps %ymm12, %ymm13, %ymm15
1169 vaddps %ymm15, %ymm1, %ymm1
1170 vbroadcastss 72(%r12), %ymm13 // B[2]
1171 vmulps %ymm12, %ymm13, %ymm15
1172 vaddps %ymm15, %ymm2, %ymm2
1173 vbroadcastss 104(%r12), %ymm13 // B[3]
1174 vmulps %ymm12, %ymm13, %ymm15
1175 vaddps %ymm15, %ymm3, %ymm3
1176 vbroadcastss 136(%r12), %ymm13 // B[4]
1177 vmulps %ymm12, %ymm13, %ymm15
1178 vaddps %ymm15, %ymm4, %ymm4
1179 vbroadcastss 168(%r12), %ymm13 // B[5]
1180 vmulps %ymm12, %ymm13, %ymm15
1181 vaddps %ymm15, %ymm5, %ymm5
1182 vbroadcastss 200(%r12), %ymm13 // B[6]
1183 vmulps %ymm12, %ymm13, %ymm15
1184 vaddps %ymm15, %ymm6, %ymm6
1185 vbroadcastss 232(%r12), %ymm13 // B[7]
1186 vmulps %ymm12, %ymm13, %ymm15
1187 vaddps %ymm15, %ymm7, %ymm7
1188
1189
1190 // unroll 3
1191 vmovaps 96(%r11), %ymm12 // A[0]
1192 vbroadcastss 12(%r12), %ymm13 // B[0]
1193 vmulps %ymm12, %ymm13, %ymm15
1194 vaddps %ymm15, %ymm0, %ymm0
1195 vbroadcastss 44(%r12), %ymm13 // B[1]
1196 vmulps %ymm12, %ymm13, %ymm15
1197 vaddps %ymm15, %ymm1, %ymm1
1198 vbroadcastss 76(%r12), %ymm13 // B[2]
1199 vmulps %ymm12, %ymm13, %ymm15
1200 vaddps %ymm15, %ymm2, %ymm2
1201 vbroadcastss 108(%r12), %ymm13 // B[3]
1202 vmulps %ymm12, %ymm13, %ymm15
1203 vaddps %ymm15, %ymm3, %ymm3
1204 vbroadcastss 140(%r12), %ymm13 // B[4]
1205 vmulps %ymm12, %ymm13, %ymm15
1206 vaddps %ymm15, %ymm4, %ymm4
1207 vbroadcastss 172(%r12), %ymm13 // B[5]
1208 vmulps %ymm12, %ymm13, %ymm15
1209 vaddps %ymm15, %ymm5, %ymm5
1210 vbroadcastss 204(%r12), %ymm13 // B[6]
1211 vmulps %ymm12, %ymm13, %ymm15
1212 vaddps %ymm15, %ymm6, %ymm6
1213 vbroadcastss 236(%r12), %ymm13 // B[7]
1214 vmulps %ymm12, %ymm13, %ymm15
1215 vaddps %ymm15, %ymm7, %ymm7
1216
1217
1218 // unroll 4
1219 vmovaps 128(%r11), %ymm12 // A[0]
1220 vbroadcastss 16(%r12), %ymm13 // B[0]
1221 vmulps %ymm12, %ymm13, %ymm15
1222 vaddps %ymm15, %ymm0, %ymm0
1223 vbroadcastss 48(%r12), %ymm13 // B[1]
1224 vmulps %ymm12, %ymm13, %ymm15
1225 vaddps %ymm15, %ymm1, %ymm1
1226 vbroadcastss 80(%r12), %ymm13 // B[2]
1227 vmulps %ymm12, %ymm13, %ymm15
1228 vaddps %ymm15, %ymm2, %ymm2
1229 vbroadcastss 112(%r12), %ymm13 // B[3]
1230 vmulps %ymm12, %ymm13, %ymm15
1231 vaddps %ymm15, %ymm3, %ymm3
1232 vbroadcastss 144(%r12), %ymm13 // B[4]
1233 vmulps %ymm12, %ymm13, %ymm15
1234 vaddps %ymm15, %ymm4, %ymm4
1235 vbroadcastss 176(%r12), %ymm13 // B[5]
1236 vmulps %ymm12, %ymm13, %ymm15
1237 vaddps %ymm15, %ymm5, %ymm5
1238 vbroadcastss 208(%r12), %ymm13 // B[6]
1239 vmulps %ymm12, %ymm13, %ymm15
1240 vaddps %ymm15, %ymm6, %ymm6
1241 vbroadcastss 240(%r12), %ymm13 // B[7]
1242 vmulps %ymm12, %ymm13, %ymm15
1243 vaddps %ymm15, %ymm7, %ymm7
1244
1245
1246 // unroll 5
1247 vmovaps 160(%r11), %ymm12 // A[0]
1248 vbroadcastss 20(%r12), %ymm13 // B[0]
1249 vmulps %ymm12, %ymm13, %ymm15
1250 vaddps %ymm15, %ymm0, %ymm0
1251 vbroadcastss 52(%r12), %ymm13 // B[1]
1252 vmulps %ymm12, %ymm13, %ymm15
1253 vaddps %ymm15, %ymm1, %ymm1
1254 vbroadcastss 84(%r12), %ymm13 // B[2]
1255 vmulps %ymm12, %ymm13, %ymm15
1256 vaddps %ymm15, %ymm2, %ymm2
1257 vbroadcastss 116(%r12), %ymm13 // B[3]
1258 vmulps %ymm12, %ymm13, %ymm15
1259 vaddps %ymm15, %ymm3, %ymm3
1260 vbroadcastss 148(%r12), %ymm13 // B[4]
1261 vmulps %ymm12, %ymm13, %ymm15
1262 vaddps %ymm15, %ymm4, %ymm4
1263 vbroadcastss 180(%r12), %ymm13 // B[5]
1264 vmulps %ymm12, %ymm13, %ymm15
1265 vaddps %ymm15, %ymm5, %ymm5
1266 vbroadcastss 212(%r12), %ymm13 // B[6]
1267 vmulps %ymm12, %ymm13, %ymm15
1268 vaddps %ymm15, %ymm6, %ymm6
1269 vbroadcastss 244(%r12), %ymm13 // B[7]
1270 vmulps %ymm12, %ymm13, %ymm15
1271 vaddps %ymm15, %ymm7, %ymm7
1272
1273
1274 // unroll 6
1275 vmovaps 192(%r11), %ymm12 // A[0]
1276 vbroadcastss 24(%r12), %ymm13 // B[0]
1277 vmulps %ymm12, %ymm13, %ymm15
1278 vaddps %ymm15, %ymm0, %ymm0
1279 vbroadcastss 56(%r12), %ymm13 // B[1]
1280 vmulps %ymm12, %ymm13, %ymm15
1281 vaddps %ymm15, %ymm1, %ymm1
1282 vbroadcastss 88(%r12), %ymm13 // B[2]
1283 vmulps %ymm12, %ymm13, %ymm15
1284 vaddps %ymm15, %ymm2, %ymm2
1285 vbroadcastss 120(%r12), %ymm13 // B[3]
1286 vmulps %ymm12, %ymm13, %ymm15
1287 vaddps %ymm15, %ymm3, %ymm3
1288 vbroadcastss 152(%r12), %ymm13 // B[4]
1289 vmulps %ymm12, %ymm13, %ymm15
1290 vaddps %ymm15, %ymm4, %ymm4
1291 vbroadcastss 184(%r12), %ymm13 // B[5]
1292 vmulps %ymm12, %ymm13, %ymm15
1293 vaddps %ymm15, %ymm5, %ymm5
1294 vbroadcastss 216(%r12), %ymm13 // B[6]
1295 vmulps %ymm12, %ymm13, %ymm15
1296 vaddps %ymm15, %ymm6, %ymm6
1297 vbroadcastss 248(%r12), %ymm13 // B[7]
1298 vmulps %ymm12, %ymm13, %ymm15
1299 vaddps %ymm15, %ymm7, %ymm7
1300
1301
1302 // unroll 7
1303 vmovaps 224(%r11), %ymm12 // A[0]
1304 vbroadcastss 28(%r12), %ymm13 // B[0]
1305 vmulps %ymm12, %ymm13, %ymm15
1306 vaddps %ymm15, %ymm0, %ymm0
1307 vbroadcastss 60(%r12), %ymm13 // B[1]
1308 vmulps %ymm12, %ymm13, %ymm15
1309 vaddps %ymm15, %ymm1, %ymm1
1310 vbroadcastss 92(%r12), %ymm13 // B[2]
1311 vmulps %ymm12, %ymm13, %ymm15
1312 vaddps %ymm15, %ymm2, %ymm2
1313 vbroadcastss 124(%r12), %ymm13 // B[3]
1314 vmulps %ymm12, %ymm13, %ymm15
1315 vaddps %ymm15, %ymm3, %ymm3
1316 vbroadcastss 156(%r12), %ymm13 // B[4]
1317 vmulps %ymm12, %ymm13, %ymm15
1318 vaddps %ymm15, %ymm4, %ymm4
1319 vbroadcastss 188(%r12), %ymm13 // B[5]
1320 vmulps %ymm12, %ymm13, %ymm15
1321 vaddps %ymm15, %ymm5, %ymm5
1322 vbroadcastss 220(%r12), %ymm13 // B[6]
1323 vmulps %ymm12, %ymm13, %ymm15
1324 vaddps %ymm15, %ymm6, %ymm6
1325 vbroadcastss 252(%r12), %ymm13 // B[7]
1326 vmulps %ymm12, %ymm13, %ymm15
1327 vaddps %ymm15, %ymm7, %ymm7
1328
1329 subl $8, %r10d
1330 addq $256, %r11
1331
1332 mov %r14, %r12
1333 addq %r13, %r14
1334
1335 cmpl $7, %r10d
1336 jg 1b // main loop
1337
1338
13390: // consider clean1-up loop
1340
1341 cmpl $0, %r10d
1342 jle 2f // return
1343
13443: // clean1-up loop
1345
1346 // unroll 0
1347 vmovaps 0(%r11), %ymm12 // A[0]
1348 vbroadcastss 0(%r12), %ymm13 // B[0]
1349 vmulps %ymm12, %ymm13, %ymm15
1350 vaddps %ymm15, %ymm0, %ymm0
1351 vbroadcastss 32(%r12), %ymm13 // B[1]
1352 vmulps %ymm12, %ymm13, %ymm15
1353 vaddps %ymm15, %ymm1, %ymm1
1354 vbroadcastss 64(%r12), %ymm13 // B[2]
1355 vmulps %ymm12, %ymm13, %ymm15
1356 vaddps %ymm15, %ymm2, %ymm2
1357 vbroadcastss 96(%r12), %ymm13 // B[3]
1358 vmulps %ymm12, %ymm13, %ymm15
1359 vaddps %ymm15, %ymm3, %ymm3
1360 vbroadcastss 128(%r12), %ymm13 // B[4]
1361 vmulps %ymm12, %ymm13, %ymm15
1362 vaddps %ymm15, %ymm4, %ymm4
1363 vbroadcastss 160(%r12), %ymm13 // B[5]
1364 vmulps %ymm12, %ymm13, %ymm15
1365 vaddps %ymm15, %ymm5, %ymm5
1366 vbroadcastss 192(%r12), %ymm13 // B[6]
1367 vmulps %ymm12, %ymm13, %ymm15
1368 vaddps %ymm15, %ymm6, %ymm6
1369 vbroadcastss 224(%r12), %ymm13 // B[7]
1370 vmulps %ymm12, %ymm13, %ymm15
1371 vaddps %ymm15, %ymm7, %ymm7
1372
1373 subl $1, %r10d
1374 addq $32, %r11
1375 addq $4, %r12
1376
1377 cmpl $0, %r10d
1378 jg 3b // clean up loop
1379
1380
13812: // return
1382
1383#if MACRO_LEVEL>=2
1384 .endm
1385#else
1386 ret
1387
1388#if defined(OS_LINUX)
1389 .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
1390#endif
1391#endif
1392
1393
1394
1395
1396
1397// common inner routine with file scope
1398//
1399// edge for B unaligned
1400//
1401// input arguments:
1402// r10 <- k
1403// r11 <- A
1404// r12 <- B
1405// r13 <- bs*sdb*sizeof(float)
1406// r14 <- offB
1407// ymm0 <- []
1408// ymm1 <- []
1409// ymm2 <- []
1410// ymm3 <- []
1411// ymm8 <- dirty
1412// ymm12 <- dirty
1413// ymm15 <- dirty
1414
1415//
1416// output arguments:
1417// r10 <- k-(8-offB)
1418// r11 <- A+(8-offB)*bs*sizeof(float)
1419// r12 <- B+bs*sdb*sizeof(float) (start of the next panel, when k > 8-offB)
1420// r13 <- bs*sdb*sizeof(float)
1421// r14 <- offB
1422// ymm0 <- []
1423// ymm1 <- []
1424// ymm2 <- []
1425// ymm3 <- []
1426// ymm8 <- dirty
1427// ymm12 <- dirty
1428// ymm15 <- dirty
1429
1430
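// A rough C model of this edge case (illustrative reference only): when B starts offB rows into
// its 8-row panel, at most kend = min(k, 8-offB) k-iterations can be taken from that panel before
// B has to jump to the start of the next one.
//
//	static void nn_edge_ref(int k, int offB, int panel_stride /* in floats */,
//	                        const float *A, const float *B, float acc[8][8])
//	{
//		int kend = (8 - offB < k) ? 8 - offB : k;    // the cmovgl below
//		B += offB;                                   // skip to row offB of the first panel
//		for (int kk = 0; kk < kend; kk++, A += 8, B += 1)
//			for (int j = 0; j < 8; j++)
//				for (int i = 0; i < 8; i++)
//					acc[j][i] += A[i] * B[8 * j];
//		// on exit the registers hold k-kend, the advanced A and, if k > kend, B moved to the
//		// start of the next panel, which is what the addq %r13 / subq $32 pair below achieves
//	}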
1431#if MACRO_LEVEL>=1
1432 .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
1433#else
1434 .p2align 4,,15
1435#if defined(OS_LINUX)
1436 .type inner_edge_gemm_add_nn_8x8_lib8, @function
1437inner_edge_gemm_add_nn_8x8_lib8:
1438#elif defined(OS_MAC)
1439_inner_edge_gemm_add_nn_8x8_lib8:
1440#elif defined(OS_WINDOWS)
1441 .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
1442inner_edge_gemm_add_nn_8x8_lib8:
1443#endif
1444#endif
1445
1446 cmpl $0, %r14d // offset==0
1447 jle 2f // end
1448
1449 cmpl $0, %r10d // k==0
1450 jle 2f // end
1451
1452 movl $8, %ebx
1453 subl %r14d, %ebx // 8-offsetB
1454 cmpl %r10d, %ebx
1455// jle 0f
1456// movl %r10d, %ebx // kend=min(k,8-offsetB)
1457//0:
1458 cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
1459
1460 movl %r14d, %eax
1461 sall $2, %eax // offsetB*sizeof(float)
1462 addq %rax, %r12 // B+offsetB*sizeof(float)
14631:
1464 // unroll 0
1465 vmovaps 0(%r11), %ymm12 // A[0]
1466 vbroadcastss 0(%r12), %ymm13 // B[0]
1467 vmulps %ymm12, %ymm13, %ymm15
1468 vaddps %ymm15, %ymm0, %ymm0
1469 vbroadcastss 32(%r12), %ymm13 // B[1]
1470 vmulps %ymm12, %ymm13, %ymm15
1471 vaddps %ymm15, %ymm1, %ymm1
1472 vbroadcastss 64(%r12), %ymm13 // B[2]
1473 vmulps %ymm12, %ymm13, %ymm15
1474 vaddps %ymm15, %ymm2, %ymm2
1475 vbroadcastss 96(%r12), %ymm13 // B[3]
1476 vmulps %ymm12, %ymm13, %ymm15
1477 vaddps %ymm15, %ymm3, %ymm3
1478 vbroadcastss 128(%r12), %ymm13 // B[4]
1479 vmulps %ymm12, %ymm13, %ymm15
1480 vaddps %ymm15, %ymm4, %ymm4
1481 vbroadcastss 160(%r12), %ymm13 // B[5]
1482 vmulps %ymm12, %ymm13, %ymm15
1483 vaddps %ymm15, %ymm5, %ymm5
1484 vbroadcastss 192(%r12), %ymm13 // B[6]
1485 vmulps %ymm12, %ymm13, %ymm15
1486 vaddps %ymm15, %ymm6, %ymm6
1487 vbroadcastss 224(%r12), %ymm13 // B[7]
1488 vmulps %ymm12, %ymm13, %ymm15
1489 vaddps %ymm15, %ymm7, %ymm7
1490
1491 subl $1, %r10d // k-1
1492 subl $1, %ebx // kend-1
1493 addq $32, %r11 // A+1*bs*sizeof(float)
1494 addq $4, %r12 // B+1*sizeof(float)
1495
1496 cmpl $0, %ebx
1497 jg 1b
1498
1499 cmpl $0, %r10d
1500 jle 2f // end
1501
1502 addq %r13, %r12
1503 subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
1504
15052:
1506
1507#if MACRO_LEVEL>=1
1508 .endm
1509#else
1510 ret
1511
1512#if defined(OS_LINUX)
1513 .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
1514#endif
1515#endif
1516
1517
1518
1519
1520
1521// common inner routine with file scope
1522//
1523// strsm
1524// right
1525// lower
1526// transposed
1527// not-unit
1528//
1529// input arguments:
1530// r10 <- D
1531// r11 <- inv_diag_D
1532// r12d <- kn
1533// ymm0 <- []
1534// ymm1 <- []
1535// ymm2 <- []
1536// ymm3 <- []
1537// ymm4 <- []
1538// ymm5 <- []
1539// ymm6 <- []
1540// ymm7 <- []
1541// ymm12 <- dirty
1542// ymm13 <- dirty
1543// ymm14 <- dirty
1544// ymm15 <- dirty
1545//
1546// output arguments:
1547// r10 <- D
1548// r11 <- inv_diag_D
1549// r12d <- kn
1550// ymm0 <- []
1551// ymm1 <- []
1552// ymm2 <- []
1553// ymm3 <- []
1554// ymm4 <- []
1555// ymm5 <- []
1556// ymm6 <- []
1557// ymm7 <- []
1558// ymm12 <- dirty
1559// ymm13 <- dirty
1560// ymm14 <- dirty
1561// ymm15 <- dirty
1562
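// A rough C model of this solve (illustrative reference only): the 8x8 block held in ymm0..ymm7 is
// overwritten with the solution X of X * D^T = X_in, where D is the lower-triangular 8x8 panel at
// r10 (8 floats per column) and inv_diag_D holds the precomputed reciprocals of its diagonal.
//
//	static void trsm_rlt_inv_ref(const float *D, const float *inv_diag_D, float x[8][8])
//	{
//		for (int j = 0; j < 8; j++)
//			{
//			for (int i = 0; i < 8; i++)
//				x[j][i] *= inv_diag_D[j];              // scale column j by 1/D[j][j]
//			for (int jj = j + 1; jj < 8; jj++)         // eliminate it from the later columns
//				for (int i = 0; i < 8; i++)
//					x[jj][i] -= x[j][i] * D[jj + 8 * j];
//			}
//	}
//
// The _vs_ version below additionally takes kn and returns early (the cmpl/jl pairs) once the
// remaining columns are not needed.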
1563#if MACRO_LEVEL>=1
1564 .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
1565#else
1566 .p2align 4,,15
1567#if defined(OS_LINUX)
1568 .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
1569inner_edge_trsm_rlt_inv_8x8_vs_lib8:
1570#elif defined(OS_MAC)
1571_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
1572#elif defined(OS_WINDOWS)
1573 .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
1574inner_edge_trsm_rlt_inv_8x8_vs_lib8:
1575#endif
1576#endif
1577
1578 vbroadcastss 0(%r11), %ymm13
1579 vmulps %ymm0, %ymm13, %ymm0
1580 vbroadcastss 4(%r10), %ymm13
1581 vmulps %ymm0, %ymm13, %ymm12
1582 vsubps %ymm12, %ymm1, %ymm1
1583 vbroadcastss 8(%r10), %ymm13
1584 vmulps %ymm0, %ymm13, %ymm12
1585 vsubps %ymm12, %ymm2, %ymm2
1586 vbroadcastss 12(%r10), %ymm13
1587 vmulps %ymm0, %ymm13, %ymm12
1588 vsubps %ymm12, %ymm3, %ymm3
1589 vbroadcastss 16(%r10), %ymm13
1590 vmulps %ymm0, %ymm13, %ymm12
1591 vsubps %ymm12, %ymm4, %ymm4
1592 vbroadcastss 20(%r10), %ymm13
1593 vmulps %ymm0, %ymm13, %ymm12
1594 vsubps %ymm12, %ymm5, %ymm5
1595 vbroadcastss 24(%r10), %ymm13
1596 vmulps %ymm0, %ymm13, %ymm12
1597 vsubps %ymm12, %ymm6, %ymm6
1598 vbroadcastss 28(%r10), %ymm13
1599 vmulps %ymm0, %ymm13, %ymm12
1600 vsubps %ymm12, %ymm7, %ymm7
1601
1602 vbroadcastss 4(%r11), %ymm13
1603 vmulps %ymm1, %ymm13, %ymm1
1604 vbroadcastss 40(%r10), %ymm13
1605 vmulps %ymm1, %ymm13, %ymm12
1606 vsubps %ymm12, %ymm2, %ymm2
1607 vbroadcastss 44(%r10), %ymm13
1608 vmulps %ymm1, %ymm13, %ymm12
1609 vsubps %ymm12, %ymm3, %ymm3
1610 vbroadcastss 48(%r10), %ymm13
1611 vmulps %ymm1, %ymm13, %ymm12
1612 vsubps %ymm12, %ymm4, %ymm4
1613 vbroadcastss 52(%r10), %ymm13
1614 vmulps %ymm1, %ymm13, %ymm12
1615 vsubps %ymm12, %ymm5, %ymm5
1616 vbroadcastss 56(%r10), %ymm13
1617 vmulps %ymm1, %ymm13, %ymm12
1618 vsubps %ymm12, %ymm6, %ymm6
1619 vbroadcastss 60(%r10), %ymm13
1620 vmulps %ymm1, %ymm13, %ymm12
1621 vsubps %ymm12, %ymm7, %ymm7
1622
1623 vbroadcastss 8(%r11), %ymm13
1624 vmulps %ymm2, %ymm13, %ymm2
1625 vbroadcastss 76(%r10), %ymm13
1626 vmulps %ymm2, %ymm13, %ymm12
1627 vsubps %ymm12, %ymm3, %ymm3
1628 vbroadcastss 80(%r10), %ymm13
1629 vmulps %ymm2, %ymm13, %ymm12
1630 vsubps %ymm12, %ymm4, %ymm4
1631 vbroadcastss 84(%r10), %ymm13
1632 vmulps %ymm2, %ymm13, %ymm12
1633 vsubps %ymm12, %ymm5, %ymm5
1634 vbroadcastss 88(%r10), %ymm13
1635 vmulps %ymm2, %ymm13, %ymm12
1636 vsubps %ymm12, %ymm6, %ymm6
1637 vbroadcastss 92(%r10), %ymm13
1638 vmulps %ymm2, %ymm13, %ymm12
1639 vsubps %ymm12, %ymm7, %ymm7
1640
1641 vbroadcastss 12(%r11), %ymm13
1642 vmulps %ymm3, %ymm13, %ymm3
1643 vbroadcastss 112(%r10), %ymm13
1644 vmulps %ymm3, %ymm13, %ymm12
1645 vsubps %ymm12, %ymm4, %ymm4
1646 vbroadcastss 116(%r10), %ymm13
1647 vmulps %ymm3, %ymm13, %ymm12
1648 vsubps %ymm12, %ymm5, %ymm5
1649 vbroadcastss 120(%r10), %ymm13
1650 vmulps %ymm3, %ymm13, %ymm12
1651 vsubps %ymm12, %ymm6, %ymm6
1652 vbroadcastss 124(%r10), %ymm13
1653 vmulps %ymm3, %ymm13, %ymm12
1654 vsubps %ymm12, %ymm7, %ymm7
1655
1656 vbroadcastss 16(%r11), %ymm13
1657 vmulps %ymm4, %ymm13, %ymm4
1658 cmpl $6, %r12d
1659 jl 0f // ret
1660 vbroadcastss 148(%r10), %ymm13
1661 vmulps %ymm4, %ymm13, %ymm12
1662 vsubps %ymm12, %ymm5, %ymm5
1663 vbroadcastss 152(%r10), %ymm13
1664 vmulps %ymm4, %ymm13, %ymm12
1665 vsubps %ymm12, %ymm6, %ymm6
1666 vbroadcastss 156(%r10), %ymm13
1667 vmulps %ymm4, %ymm13, %ymm12
1668 vsubps %ymm12, %ymm7, %ymm7
1669
1670 vbroadcastss 20(%r11), %ymm13
1671 vmulps %ymm5, %ymm13, %ymm5
1672 cmpl $7, %r12d
1673 jl 0f // ret
1674 vbroadcastss 184(%r10), %ymm13
1675 vmulps %ymm5, %ymm13, %ymm12
1676 vsubps %ymm12, %ymm6, %ymm6
1677 vbroadcastss 188(%r10), %ymm13
1678 vmulps %ymm5, %ymm13, %ymm12
1679 vsubps %ymm12, %ymm7, %ymm7
1680
1681 vbroadcastss 24(%r11), %ymm13
1682 vmulps %ymm6, %ymm13, %ymm6
1683 cmpl $8, %r12d
1684 jl 0f // ret
1685 vbroadcastss 220(%r10), %ymm13
1686 vmulps %ymm6, %ymm13, %ymm12
1687 vsubps %ymm12, %ymm7, %ymm7
1688
1689 vbroadcastss 28(%r11), %ymm13
1690 vmulps %ymm7, %ymm13, %ymm7
1691
16920:
1693
1694#if MACRO_LEVEL>=1
1695 .endm
1696#else
1697 ret
1698
1699#if defined(OS_LINUX)
1700 .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
1701#endif
1702#endif
1703
1704
1705
1706
1707
1708// common inner routine with file scope
1709//
1710// cholesky factorization
1711//
1712// input arguments:
1713// r10 <- inv_diag_E
1714// r11d <- kn
1715// ymm0 <- []
1716// ymm1 <- []
1717// ymm2 <- []
1718// ymm3 <- []
1719// ymm4 <- []
1720// ymm5 <- []
1721// ymm6 <- []
1722// ymm7 <- []
1723// ymm12 <- dirty
1724// ymm13 <- dirty
1725// ymm14 <- dirty
1726// ymm15 <- dirty
1727//
1728// output arguments:
1729// r10 <- inv_diag_E
1730// r11d <- kn
1731// ymm0 <- []
1732// ymm1 <- []
1733// ymm2 <- []
1734// ymm3 <- []
1735// ymm4 <- []
1736// ymm5 <- []
1737// ymm6 <- []
1738// ymm7 <- []
1739// ymm12 <- dirty
1740// ymm13 <- dirty
1741// ymm14 <- dirty
1742// ymm15 <- dirty
1743
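// A rough C model of this step (illustrative reference only): an unblocked lower Cholesky of the
// 8x8 block held in ymm0..ymm7 (column j in ymm(j)), storing the reciprocal of each diagonal
// entry to inv_diag_E. A non-positive pivot falls back to 0.0, exactly like the jbe branches below.
//
//	#include <math.h>
//
//	static void potrf_8x8_ref(float x[8][8], float *inv_diag_E)
//	{
//		for (int j = 0; j < 8; j++)
//			{
//			float t = x[j][j] > 0.0f ? 1.0f / sqrtf(x[j][j]) : 0.0f;
//			inv_diag_E[j] = t;
//			for (int i = 0; i < 8; i++)
//				x[j][i] *= t;                      // column j of the factor L
//			for (int jj = j + 1; jj < 8; jj++)     // right-looking trailing update
//				for (int i = 0; i < 8; i++)
//					x[jj][i] -= x[j][i] * x[j][jj];
//			}
//	}
//
// The _vs_ version below also takes kn and skips the trailing updates for columns that will not
// be stored.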
1744#if MACRO_LEVEL>=1
1745 .macro INNER_EDGE_POTRF_8X8_VS_LIB8
1746#else
1747 .p2align 4,,15
1748#if defined(OS_LINUX)
1749 .type inner_edge_potrf_8x8_vs_lib8, @function
1750inner_edge_potrf_8x8_vs_lib8:
1751#elif defined(OS_MAC)
1752_inner_edge_potrf_8x8_vs_lib8:
1753#elif defined(OS_WINDOWS)
1754 .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
1755inner_edge_potrf_8x8_vs_lib8:
1756#endif
1757#endif
1758
1759 vxorps %ymm15, %ymm15, %ymm15 // 0.0
1760#if defined(OS_LINUX) | defined(OS_WINDOWS)
1761 vmovss .LC03(%rip), %xmm14 // 1.0
1762#elif defined(OS_MAC)
1763 vmovss LC03(%rip), %xmm14 // 1.0
1764#endif
1765
1766 vmovss %xmm0, %xmm0, %xmm13
1767 vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
1768 jbe 1f
1769 vsqrtss %xmm13, %xmm13, %xmm13
1770 vdivss %xmm13, %xmm14, %xmm13
17712:
1772 vmovss %xmm13, 0(%r10)
1773 vpermilps $0x00, %xmm13, %xmm13
1774 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1775 vmulps %ymm0, %ymm13, %ymm0
1776 vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
1777 vpermilps $0x55, %ymm11, %ymm13
1778 vmulps %ymm0, %ymm13, %ymm12
1779 vsubps %ymm12, %ymm1, %ymm1
1780 vpermilps $0xaa, %ymm11, %ymm13
1781 vmulps %ymm0, %ymm13, %ymm12
1782 vsubps %ymm12, %ymm2, %ymm2
1783 vpermilps $0xff, %ymm11, %ymm13
1784 vmulps %ymm0, %ymm13, %ymm12
1785 vsubps %ymm12, %ymm3, %ymm3
1786 vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
1787 vpermilps $0x00, %ymm11, %ymm13
1788 vmulps %ymm0, %ymm13, %ymm12
1789 vsubps %ymm12, %ymm4, %ymm4
1790 vpermilps $0x55, %ymm11, %ymm13
1791 vmulps %ymm0, %ymm13, %ymm12
1792 vsubps %ymm12, %ymm5, %ymm5
1793 vpermilps $0xaa, %ymm11, %ymm13
1794 vmulps %ymm0, %ymm13, %ymm12
1795 vsubps %ymm12, %ymm6, %ymm6
1796 vpermilps $0xff, %ymm11, %ymm13
1797 vmulps %ymm0, %ymm13, %ymm12
1798 vsubps %ymm12, %ymm7, %ymm7
1799
1800
1801 vpermilps $0x55, %xmm1, %xmm13
1802 vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
1803 jbe 3f
1804 vsqrtss %xmm13, %xmm13, %xmm13
1805 vdivss %xmm13, %xmm14, %xmm13
18064:
1807 vmovss %xmm13, 4(%r10)
1808 vpermilps $0x00, %xmm13, %xmm13
1809 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1810 vmulps %ymm1, %ymm13, %ymm1
1811 vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
1812 vpermilps $0xaa, %ymm11, %ymm13
1813 vmulps %ymm1, %ymm13, %ymm12
1814 vsubps %ymm12, %ymm2, %ymm2
1815 vpermilps $0xff, %ymm11, %ymm13
1816 vmulps %ymm1, %ymm13, %ymm12
1817 vsubps %ymm12, %ymm3, %ymm3
1818 vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
1819 vpermilps $0x00, %ymm11, %ymm13
1820 vmulps %ymm1, %ymm13, %ymm12
1821 vsubps %ymm12, %ymm4, %ymm4
1822 vpermilps $0x55, %ymm11, %ymm13
1823 vmulps %ymm1, %ymm13, %ymm12
1824 vsubps %ymm12, %ymm5, %ymm5
1825 vpermilps $0xaa, %ymm11, %ymm13
1826 vmulps %ymm1, %ymm13, %ymm12
1827 vsubps %ymm12, %ymm6, %ymm6
1828 vpermilps $0xff, %ymm11, %ymm13
1829 vmulps %ymm1, %ymm13, %ymm12
1830 vsubps %ymm12, %ymm7, %ymm7
1831
1832
1833 vpermilps $0xaa, %xmm2, %xmm13
1834 vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
1835 jbe 5f
1836 vsqrtss %xmm13, %xmm13, %xmm13
1837 vdivss %xmm13, %xmm14, %xmm13
18386:
1839 vmovss %xmm13, 8(%r10)
1840 vpermilps $0x00, %xmm13, %xmm13
1841 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1842 vmulps %ymm2, %ymm13, %ymm2
1843 vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
1844 vpermilps $0xff, %ymm11, %ymm13
1845 vmulps %ymm2, %ymm13, %ymm12
1846 vsubps %ymm12, %ymm3, %ymm3
1847 vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
1848 vpermilps $0x00, %ymm11, %ymm13
1849 vmulps %ymm2, %ymm13, %ymm12
1850 vsubps %ymm12, %ymm4, %ymm4
1851 vpermilps $0x55, %ymm11, %ymm13
1852 vmulps %ymm2, %ymm13, %ymm12
1853 vsubps %ymm12, %ymm5, %ymm5
1854 vpermilps $0xaa, %ymm11, %ymm13
1855 vmulps %ymm2, %ymm13, %ymm12
1856 vsubps %ymm12, %ymm6, %ymm6
1857 vpermilps $0xff, %ymm11, %ymm13
1858 vmulps %ymm2, %ymm13, %ymm12
1859 vsubps %ymm12, %ymm7, %ymm7
1860
1861
1862 vpermilps $0xff, %xmm3, %xmm13
1863 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
1864 jbe 7f
1865 vsqrtss %xmm13, %xmm13, %xmm13
1866 vdivss %xmm13, %xmm14, %xmm13
18678:
1868	vmovss	%xmm13, 12(%r10)
1869 vpermilps $0x00, %xmm13, %xmm13
1870 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1871 vmulps %ymm3, %ymm13, %ymm3
1872 vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
1873 vpermilps $0x00, %ymm11, %ymm13
1874 vmulps %ymm3, %ymm13, %ymm12
1875 vsubps %ymm12, %ymm4, %ymm4
1876 vpermilps $0x55, %ymm11, %ymm13
1877 vmulps %ymm3, %ymm13, %ymm12
1878 vsubps %ymm12, %ymm5, %ymm5
1879 vpermilps $0xaa, %ymm11, %ymm13
1880 vmulps %ymm3, %ymm13, %ymm12
1881 vsubps %ymm12, %ymm6, %ymm6
1882 vpermilps $0xff, %ymm11, %ymm13
1883 vmulps %ymm3, %ymm13, %ymm12
1884 vsubps %ymm12, %ymm7, %ymm7
1885
1886
1887 vextractf128 $0x1, %ymm4, %xmm13
1888// vpermilps $0x00, %xmm13, %xmm13
1889	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
1890 jbe 9f
1891 vsqrtss %xmm13, %xmm13, %xmm13
1892 vdivss %xmm13, %xmm14, %xmm13
189310:
1894	vmovss	%xmm13, 16(%r10)
1895 vpermilps $0x00, %xmm13, %xmm13
1896 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1897 vmulps %ymm4, %ymm13, %ymm4
1898 cmpl $6, %r11d
1899 jl 0f // ret
1900 vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
1901 vpermilps $0x55, %ymm11, %ymm13
1902 vmulps %ymm4, %ymm13, %ymm12
1903 vsubps %ymm12, %ymm5, %ymm5
1904 vpermilps $0xaa, %ymm11, %ymm13
1905 vmulps %ymm4, %ymm13, %ymm12
1906 vsubps %ymm12, %ymm6, %ymm6
1907 vpermilps $0xff, %ymm11, %ymm13
1908 vmulps %ymm4, %ymm13, %ymm12
1909 vsubps %ymm12, %ymm7, %ymm7
1910
1911
1912 vextractf128 $0x1, %ymm5, %xmm13
1913 vpermilps $0x55, %xmm13, %xmm13
1914 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
1915 jbe 11f
1916 vsqrtss %xmm13, %xmm13, %xmm13
1917 vdivss %xmm13, %xmm14, %xmm13
191812:
1919	vmovss	%xmm13, 20(%r10)
1920 vpermilps $0x00, %xmm13, %xmm13
1921 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1922 vmulps %ymm5, %ymm13, %ymm5
1923 cmpl $7, %r11d
1924 jl 0f // ret
1925 vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
1926 vpermilps $0xaa, %ymm11, %ymm13
1927 vmulps %ymm5, %ymm13, %ymm12
1928 vsubps %ymm12, %ymm6, %ymm6
1929 vpermilps $0xff, %ymm11, %ymm13
1930 vmulps %ymm5, %ymm13, %ymm12
1931 vsubps %ymm12, %ymm7, %ymm7
1932
1933
1934 vextractf128 $0x1, %ymm6, %xmm13
1935 vpermilps $0xaa, %xmm13, %xmm13
1936 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
1937 jbe 13f
1938 vsqrtss %xmm13, %xmm13, %xmm13
1939 vdivss %xmm13, %xmm14, %xmm13
194014:
1941	vmovss	%xmm13, 24(%r10)
1942 vpermilps $0x00, %xmm13, %xmm13
1943 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1944 vmulps %ymm6, %ymm13, %ymm6
1945 cmpl $8, %r11d
1946 jl 0f // ret
1947 vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
1948 vpermilps $0xff, %ymm11, %ymm13
1949 vmulps %ymm6, %ymm13, %ymm12
1950 vsubps %ymm12, %ymm7, %ymm7
1951
1952
1953 vextractf128 $0x1, %ymm7, %xmm13
1954 vpermilps $0xff, %xmm13, %xmm13
1955 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
1956 jbe 15f
1957 vsqrtss %xmm13, %xmm13, %xmm13
1958 vdivss %xmm13, %xmm14, %xmm13
195916:
1960	vmovss	%xmm13, 28(%r10)
1961 vpermilps $0x00, %xmm13, %xmm13
1962 vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
1963 vmulps %ymm7, %ymm13, %ymm7
1964
1965
1966 jmp 0f
1967
1968
19691:
1970 vxorps %ymm13, %ymm13, %ymm13
1971 jmp 2b
1972
19733:
1974 vxorpd %ymm13, %ymm13, %ymm13
1975 jmp 4b
1976
19775:
1978 vxorpd %ymm13, %ymm13, %ymm13
1979 jmp 6b
1980
19817:
1982 vxorpd %ymm13, %ymm13, %ymm13
1983 jmp 8b
1984
19859:
1986 vxorpd %ymm13, %ymm13, %ymm13
1987 jmp 10b
1988
198911:
1990 vxorpd %ymm13, %ymm13, %ymm13
1991 jmp 12b
1992
199313:
1994 vxorpd %ymm13, %ymm13, %ymm13
1995 jmp 14b
1996
199715:
1998 vxorpd %ymm13, %ymm13, %ymm13
1999 jmp 16b
2000
20010:
2002
2003#if MACRO_LEVEL>=1
2004 .endm
2005#else
2006 ret
2007
2008#if defined(OS_LINUX)
2009 .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
2010#endif
2011#endif
2012
2013
2014
2015
2016
2017// common inner routine with file scope
2018//
2019// scale for generic alpha and beta
2020//
2021// input arguments:
2022// r10 <- alpha
2023// r11 <- beta
2024// r12 <- C
2025// ymm0 <- []
2026// ymm1 <- []
2027// ymm2 <- []
2028// ymm3 <- []
2029// ymm4 <- []
2030// ymm5 <- []
2031// ymm6 <- []
2032// ymm7 <- []
2033// ymm8 <- dirty
2034// ymm9 <- dirty
2035// ymm10 <- dirty
2036// ymm11 <- dirty
2037// ymm15 <- dirty
2038//
2039// output arguments:
2040// r10 <- alpha
2041// r11 <- beta
2042// r12 <- C
2043// ymm0 <- []
2044// ymm1 <- []
2045// ymm2 <- []
2046// ymm3 <- []
2047// ymm4 <- []
2048// ymm5 <- []
2049// ymm6 <- []
2050// ymm7 <- []
2051// ymm8 <- dirty
2052// ymm9 <- dirty
2053// ymm10 <- dirty
2054// ymm11 <- dirty
2055// ymm15 <- dirty
2056
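// A rough C model of this scaling step (illustrative reference only): the accumulated block
// becomes alpha*acc + beta*C, and when beta == 0.0 the C panel is never read (the vucomiss/je
// pair below), so in that case C does not need to hold valid data.
//
//	static void scale_ab_ref(float alpha, float beta, const float *C, float acc[8][8])
//	{
//		for (int j = 0; j < 8; j++)
//			for (int i = 0; i < 8; i++)
//				{
//				acc[j][i] *= alpha;
//				if (beta != 0.0f)
//					acc[j][i] += beta * C[i + 8 * j]; // packed 8x8 panel at r12
//				}
//	}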
2057#if MACRO_LEVEL>=1
2058 .macro INNER_SCALE_AB_8X8_LIB8
2059#else
2060 .p2align 4,,15
2061#if defined(OS_LINUX)
2062 .type inner_scale_ab_8x8_lib8, @function
2063inner_scale_ab_8x8_lib8:
2064#elif defined(OS_MAC)
2065_inner_scale_ab_8x8_lib8:
2066#elif defined(OS_WINDOWS)
2067 .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
2068inner_scale_ab_8x8_lib8:
2069#endif
2070#endif
2071
2072 // alpha
2073 vbroadcastss 0(%r10), %ymm11
2074
2075 vmulps %ymm0, %ymm11, %ymm0
2076 vmulps %ymm1, %ymm11, %ymm1
2077 vmulps %ymm2, %ymm11, %ymm2
2078 vmulps %ymm3, %ymm11, %ymm3
2079
2080 vmulps %ymm4, %ymm11, %ymm4
2081 vmulps %ymm5, %ymm11, %ymm5
2082 vmulps %ymm6, %ymm11, %ymm6
2083 vmulps %ymm7, %ymm11, %ymm7
2084
2085 // beta
2086 vbroadcastss 0(%r11), %ymm14
2087
2088 vxorps %ymm15, %ymm15, %ymm15 // 0.0
2089
2090 vucomiss %xmm15, %xmm14 // beta==0.0 ?
2091 je 0f // end
2092
2093 vmovaps 0(%r12), %ymm15
2094 vmulps %ymm15, %ymm14, %ymm15
2095 vaddps %ymm0, %ymm15, %ymm0
2096 vmovaps 32(%r12), %ymm15
2097 vmulps %ymm15, %ymm14, %ymm15
2098 vaddps %ymm1, %ymm15, %ymm1
2099 vmovaps 64(%r12), %ymm15
2100 vmulps %ymm15, %ymm14, %ymm15
2101 vaddps %ymm2, %ymm15, %ymm2
2102 vmovaps 96(%r12), %ymm15
2103 vmulps %ymm15, %ymm14, %ymm15
2104 vaddps %ymm3, %ymm15, %ymm3
2105 vmovaps 128(%r12), %ymm15
2106 vmulps %ymm15, %ymm14, %ymm15
2107 vaddps %ymm4, %ymm15, %ymm4
2108 vmovaps 160(%r12), %ymm15
2109 vmulps %ymm15, %ymm14, %ymm15
2110 vaddps %ymm5, %ymm15, %ymm5
2111 vmovaps 192(%r12), %ymm15
2112 vmulps %ymm15, %ymm14, %ymm15
2113 vaddps %ymm6, %ymm15, %ymm6
2114 vmovaps 224(%r12), %ymm15
2115 vmulps %ymm15, %ymm14, %ymm15
2116 vaddps %ymm7, %ymm15, %ymm7
2117
21180:
2119
2120#if MACRO_LEVEL>=1
2121 .endm
2122#else
2123 ret
2124
2125#if defined(OS_LINUX)
2126 .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
2127#endif
2128#endif
2129
2130
2131
2132
2133
2134// common inner routine with file scope
2135//
2136// scale for generic alpha and beta
2137//
2138// input arguments:
2139// r10 <- alpha
2140// r11 <- beta
2141// r12 <- offset
2142// r13 <- C
// r14 <- 8*sdc*sizeof(float)
2144// ymm0 <- []
2145// ymm1 <- []
2146// ymm2 <- []
2147// ymm3 <- []
2148// ymm4 <- []
2149// ymm5 <- []
2150// ymm6 <- []
2151// ymm7 <- []
2152// ymm8 <- dirty
2153// ymm9 <- dirty
2154// ymm10 <- dirty
2155// ymm11 <- dirty
2156// ymm15 <- dirty
2157//
2158// output arguments:
2159// r10 <- alpha
2160// r11 <- beta
2161// r12 <- offset
2162// r13 <- C
// r14 <- 8*sdc*sizeof(float)
2164// r15 <- n0 // col index: start from (inc)
2165// ymm0 <- []
2166// ymm1 <- []
2167// ymm2 <- []
2168// ymm3 <- []
2169// ymm4 <- []
2170// ymm5 <- []
2171// ymm6 <- []
2172// ymm7 <- []
2173// ymm8 <- dirty
2174// ymm9 <- dirty
2175// ymm10 <- dirty
2176// ymm11 <- dirty
2177// ymm15 <- dirty
2178
2179#if MACRO_LEVEL>=1
2180 .macro INNER_SCALE_AB_8X8_GEN_LIB8
2181#else
2182 .p2align 4,,15
2183#if defined(OS_LINUX)
2184 .type inner_scale_ab_8x8_gen_lib8, @function
2185inner_scale_ab_8x8_gen_lib8:
2186#elif defined(OS_MAC)
2187_inner_scale_ab_8x8_gen_lib8:
2188#elif defined(OS_WINDOWS)
2189 .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
2190inner_scale_ab_8x8_gen_lib8:
2191#endif
2192#endif
2193
2194 // alpha
2195 vbroadcastss 0(%r10), %ymm11
2196
2197 vmulps %ymm0, %ymm11, %ymm0
2198 vmulps %ymm1, %ymm11, %ymm1
2199 vmulps %ymm2, %ymm11, %ymm2
2200 vmulps %ymm3, %ymm11, %ymm3
2201
2202 vmulps %ymm4, %ymm11, %ymm4
2203 vmulps %ymm5, %ymm11, %ymm5
2204 vmulps %ymm6, %ymm11, %ymm6
2205 vmulps %ymm7, %ymm11, %ymm7
2206
2207 // beta
2208 vbroadcastss 0(%r11), %ymm15
2209
2210 vxorps %ymm14, %ymm14, %ymm14 // 0.0
2211
2212 vucomiss %xmm15, %xmm14 // beta==0.0 ?
2213 je 3f // end
2214
2215 cmpl $0, %r12d
2216 jg 0f
2217
2218 // offset==0
2219
2220 vmovaps 0(%r13), %ymm12
2221 vmulps %ymm12, %ymm15, %ymm12
2222 vaddps %ymm0, %ymm12, %ymm0
2223 vmovaps 32(%r13), %ymm12
2224 vmulps %ymm12, %ymm15, %ymm12
2225 vaddps %ymm1, %ymm12, %ymm1
2226 vmovaps 64(%r13), %ymm12
2227 vmulps %ymm12, %ymm15, %ymm12
2228 vaddps %ymm2, %ymm12, %ymm2
2229 vmovaps 96(%r13), %ymm12
2230 vmulps %ymm12, %ymm15, %ymm12
2231 vaddps %ymm3, %ymm12, %ymm3
	// beta is in ymm15 here, ymm14 holds 0.0
	vmovaps		128(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm4, %ymm12, %ymm4
	vmovaps		160(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm5, %ymm12, %ymm5
	vmovaps		192(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm6, %ymm12, %ymm6
	vmovaps		224(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm7, %ymm12, %ymm7
2244
2245 jmp 7f
2246
22470:
2248
2249 // offset > 0
2250 // 1 2 3 4 5 6 7
2251
2252 movq %r13, %r15 // C0
	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2254
	cmpl	$4, %r12d
2256 jl 1f
2257 jg 2f
2258
2259 // offset==4
2260 // TODO
2261 jmp 7f
2262
22631:
2264 // 1 2 3
2265
	cmpl	$2, %r12d
2267 jl 3f
2268 jg 4f
2269
2270 // offset==2
2271 // TODO
2272 jmp 7f
2273
22743:
2275 // offset==1
2276 // TODO
2277 jmp 7f
2278
22794:
2280 // offset==3
2281 // TODO
2282 jmp 7f
2283
22842:
2285 // 5 6 7
2286
	cmpl	$6, %r12d
2288 jl 5f
2289 jg 6f
2290
2291 // offset==6
2292 // TODO
2293 jmp 7f
2294
22955:
2296 // offset==5
2297 // TODO
2298 jmp 7f
2299
23006:
2301 // offset==7
2302 // TODO
2303 jmp 7f
2304
2305 // end
23067:
2307
2308
2309#if MACRO_LEVEL>=1
2310 .endm
2311#else
2312 ret
2313
2314#if defined(OS_LINUX)
2315 .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
2316#endif
2317#endif
2318
2319
2320
2321
2322
2323// common inner routine with file scope
2324//
2325// blend for generic alpha and beta
2326//
2327// input arguments:
2328// r10 <- alpha
2329// r11 <- beta
2330// r12 <- C
2331// ymm0 <- []
2332// ymm1 <- []
2333// ymm2 <- []
2334// ymm3 <- []
2335// ymm4 <- []
2336// ymm5 <- []
2337// ymm6 <- []
2338// ymm7 <- []
2339// ymm8 <- dirty
2340// ymm9 <- dirty
2341// ymm10 <- dirty
2342// ymm11 <- dirty
2343// ymm15 <- dirty
2344//
2345// output arguments:
2346// r10 <- alpha
2347// r11 <- beta
2348// r12 <- C
2349// ymm0 <- []
2350// ymm1 <- []
2351// ymm2 <- []
2352// ymm3 <- []
2353// ymm4 <- []
2354// ymm5 <- []
2355// ymm6 <- []
2356// ymm7 <- []
2357// ymm8 <- dirty
2358// ymm9 <- dirty
2359// ymm10 <- dirty
2360// ymm11 <- dirty
2361// ymm15 <- dirty
2362
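// Reference sketch (not part of the kernel): the two vblendps rounds below are
// pure data movement; they reorder the interleaved columns left in ymm0..ymm3
// (and ymm4..ymm7) by the NT inner kernel before the alpha/beta scaling.  A
// minimal C model of one round, with blend8() modelling
// "vblendps $imm, %ymm_b, %ymm_a, %ymm_d":
//
//     static void blend8(const float a[8], const float b[8], int imm, float d[8])
//     {
//         for (int i = 0; i < 8; i++)
//             d[i] = ((imm >> i) & 1) ? b[i] : a[i];
//     }
//
//     // blend8(r0, r1, 0xaa, t0);  blend8(r0, r1, 0x55, t1);
//     // blend8(r2, r3, 0xaa, t2);  blend8(r2, r3, 0x55, t3);
//     // blend8(t0, t3, 0xcc, r0);  blend8(t0, t3, 0x33, r2);
//     // blend8(t1, t2, 0xcc, r1);  blend8(t1, t2, 0x33, r3);
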
2363#if MACRO_LEVEL>=1
2364 .macro INNER_BLEND_SCALE_AB_8X8_LIB8
2365#else
2366 .p2align 4,,15
2367#if defined(OS_LINUX)
2368 .type inner_blend_scale_ab_8x8_lib8, @function
2369inner_blend_scale_ab_8x8_lib8:
2370#elif defined(OS_MAC)
2371_inner_blend_scale_ab_8x8_lib8:
2372#elif defined(OS_WINDOWS)
2373 .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
2374inner_blend_scale_ab_8x8_lib8:
2375#endif
2376#endif
2377
2378 // alpha
2379 vbroadcastss 0(%r10), %ymm11
2380
2381 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2382 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2383 vblendps $0xaa, %ymm3, %ymm2, %ymm14
2384 vblendps $0x55, %ymm3, %ymm2, %ymm15
2385
2386 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2387 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2388 vblendps $0xcc, %ymm14, %ymm13, %ymm1
2389 vblendps $0x33, %ymm14, %ymm13, %ymm3
2390
2391 vmulps %ymm0, %ymm11, %ymm0
2392 vmulps %ymm1, %ymm11, %ymm1
2393 vmulps %ymm2, %ymm11, %ymm2
2394 vmulps %ymm3, %ymm11, %ymm3
2395
2396 vblendps $0xaa, %ymm5, %ymm4, %ymm12
2397 vblendps $0x55, %ymm5, %ymm4, %ymm13
2398 vblendps $0xaa, %ymm7, %ymm6, %ymm14
2399 vblendps $0x55, %ymm7, %ymm6, %ymm15
2400
2401 vblendps $0xcc, %ymm15, %ymm12, %ymm4
2402 vblendps $0x33, %ymm15, %ymm12, %ymm6
2403 vblendps $0xcc, %ymm14, %ymm13, %ymm5
2404 vblendps $0x33, %ymm14, %ymm13, %ymm7
2405
2406 vmulps %ymm4, %ymm11, %ymm4
2407 vmulps %ymm5, %ymm11, %ymm5
2408 vmulps %ymm6, %ymm11, %ymm6
2409 vmulps %ymm7, %ymm11, %ymm7
2410
2411 // beta
2412 vbroadcastss 0(%r11), %ymm14
2413
2414 vxorps %ymm15, %ymm15, %ymm15 // 0.0
2415
2416 vucomiss %xmm15, %xmm14 // beta==0.0 ?
2417 je 0f // end
2418
2419 vmovaps 0(%r12), %ymm15
2420 vmulps %ymm15, %ymm14, %ymm15
2421 vaddps %ymm0, %ymm15, %ymm0
2422 vmovaps 32(%r12), %ymm15
2423 vmulps %ymm15, %ymm14, %ymm15
2424 vaddps %ymm1, %ymm15, %ymm1
2425 vmovaps 64(%r12), %ymm15
2426 vmulps %ymm15, %ymm14, %ymm15
2427 vaddps %ymm2, %ymm15, %ymm2
2428 vmovaps 96(%r12), %ymm15
2429 vmulps %ymm15, %ymm14, %ymm15
2430 vaddps %ymm3, %ymm15, %ymm3
2431 vmovaps 128(%r12), %ymm15
2432 vmulps %ymm15, %ymm14, %ymm15
2433 vaddps %ymm4, %ymm15, %ymm4
2434 vmovaps 160(%r12), %ymm15
2435 vmulps %ymm15, %ymm14, %ymm15
2436 vaddps %ymm5, %ymm15, %ymm5
2437 vmovaps 192(%r12), %ymm15
2438 vmulps %ymm15, %ymm14, %ymm15
2439 vaddps %ymm6, %ymm15, %ymm6
2440 vmovaps 224(%r12), %ymm15
2441 vmulps %ymm15, %ymm14, %ymm15
2442 vaddps %ymm7, %ymm15, %ymm7
2443
24440:
2445
2446#if MACRO_LEVEL>=1
2447 .endm
2448#else
2449 ret
2450
2451#if defined(OS_LINUX)
2452 .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
2453#endif
2454#endif
2455
2456
2457
2458
2459
2460// common inner routine with file scope
2461//
2462// blend scale for generic alpha and beta
2463//
2464// input arguments:
2465// r10 <- alpha
2466// r11 <- beta
2467// r12 <- offset
2468// r13 <- C
// r14 <- 8*sdc*sizeof(float)
2470// ymm0 <- []
2471// ymm1 <- []
2472// ymm2 <- []
2473// ymm3 <- []
2474// ymm4 <- []
2475// ymm5 <- []
2476// ymm6 <- []
2477// ymm7 <- []
2478// ymm8 <- dirty
2479// ymm9 <- dirty
2480// ymm10 <- dirty
2481// ymm11 <- dirty
2482// ymm15 <- dirty
2483//
2484// output arguments:
2485// r10 <- alpha
2486// r11 <- beta
2487// r12 <- offset
2488// r13 <- C
// r14 <- 8*sdc*sizeof(float)
2490// r15 <- n0 // col index: start from (inc)
2491// ymm0 <- []
2492// ymm1 <- []
2493// ymm2 <- []
2494// ymm3 <- []
2495// ymm4 <- []
2496// ymm5 <- []
2497// ymm6 <- []
2498// ymm7 <- []
2499// ymm8 <- dirty
2500// ymm9 <- dirty
2501// ymm10 <- dirty
2502// ymm11 <- dirty
2503// ymm15 <- dirty
2504
2505#if MACRO_LEVEL>=1
2506 .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
2507#else
2508 .p2align 4,,15
2509#if defined(OS_LINUX)
2510 .type inner_blend_scale_ab_8x8_gen_lib8, @function
2511inner_blend_scale_ab_8x8_gen_lib8:
2512#elif defined(OS_MAC)
2513_inner_blend_scale_ab_8x8_gen_lib8:
2514#elif defined(OS_WINDOWS)
2515 .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
2516inner_blend_scale_ab_8x8_gen_lib8:
2517#endif
2518#endif
2519
2520 // alpha
2521 vbroadcastss 0(%r10), %ymm11
2522
2523 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2524 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2525 vblendps $0xaa, %ymm3, %ymm2, %ymm14
2526 vblendps $0x55, %ymm3, %ymm2, %ymm15
2527
2528 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2529 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2530 vblendps $0xcc, %ymm14, %ymm13, %ymm1
2531 vblendps $0x33, %ymm14, %ymm13, %ymm3
2532
2533 vmulps %ymm0, %ymm11, %ymm0
2534 vmulps %ymm1, %ymm11, %ymm1
2535 vmulps %ymm2, %ymm11, %ymm2
2536 vmulps %ymm3, %ymm11, %ymm3
2537
2538 vblendps $0xaa, %ymm5, %ymm4, %ymm12
2539 vblendps $0x55, %ymm5, %ymm4, %ymm13
2540 vblendps $0xaa, %ymm7, %ymm6, %ymm14
2541 vblendps $0x55, %ymm7, %ymm6, %ymm15
2542
2543 vblendps $0xcc, %ymm15, %ymm12, %ymm4
2544 vblendps $0x33, %ymm15, %ymm12, %ymm6
2545 vblendps $0xcc, %ymm14, %ymm13, %ymm5
2546 vblendps $0x33, %ymm14, %ymm13, %ymm7
2547
2548 vmulps %ymm4, %ymm11, %ymm4
2549 vmulps %ymm5, %ymm11, %ymm5
2550 vmulps %ymm6, %ymm11, %ymm6
2551 vmulps %ymm7, %ymm11, %ymm7
2552
2553 // beta
2554 vbroadcastss 0(%r11), %ymm15
2555
2556 vxorps %ymm14, %ymm14, %ymm14 // 0.0
2557
2558 vucomiss %xmm15, %xmm14 // beta==0.0 ?
2559 je 3f // end
2560
2561 cmpl $0, %r12d
2562 jg 0f
2563
2564 // offset==0
2565
2566 vmovaps 0(%r13), %ymm12
2567 vmulps %ymm12, %ymm15, %ymm12
2568 vaddps %ymm0, %ymm12, %ymm0
2569 vmovaps 32(%r13), %ymm12
2570 vmulps %ymm12, %ymm15, %ymm12
2571 vaddps %ymm1, %ymm12, %ymm1
2572 vmovaps 64(%r13), %ymm12
2573 vmulps %ymm12, %ymm15, %ymm12
2574 vaddps %ymm2, %ymm12, %ymm2
2575 vmovaps 96(%r13), %ymm12
2576 vmulps %ymm12, %ymm15, %ymm12
2577 vaddps %ymm3, %ymm12, %ymm3
	// beta is in ymm15 here, ymm14 holds 0.0
	vmovaps		128(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm4, %ymm12, %ymm4
	vmovaps		160(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm5, %ymm12, %ymm5
	vmovaps		192(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm6, %ymm12, %ymm6
	vmovaps		224(%r13), %ymm12
	vmulps		%ymm12, %ymm15, %ymm12
	vaddps		%ymm7, %ymm12, %ymm7
2590
2591 jmp 7f
2592
25930:
2594
2595 // offset > 0
2596 // 1 2 3 4 5 6 7
2597
2598 movq %r13, %r15 // C0
	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2600
	cmpl	$4, %r12d
2602 jl 1f
2603 jg 2f
2604
2605 // offset==4
2606 // TODO
2607 jmp 7f
2608
26091:
2610 // 1 2 3
2611
	cmpl	$2, %r12d
2613 jl 3f
2614 jg 4f
2615
2616 // offset==2
2617 // TODO
2618 jmp 7f
2619
26203:
2621 // offset==1
2622 // TODO
2623 jmp 7f
2624
26254:
2626 // offset==3
2627 // TODO
2628 jmp 7f
2629
26302:
2631 // 5 6 7
2632
	cmpl	$6, %r12d
2634 jl 5f
2635 jg 6f
2636
2637 // offset==6
2638 // TODO
2639 jmp 7f
2640
26415:
2642 // offset==5
2643 // TODO
2644 jmp 7f
2645
26466:
2647 // offset==7
2648 // TODO
2649 jmp 7f
2650
2651 // end
26527:
2653
2654
2655#if MACRO_LEVEL>=1
2656 .endm
2657#else
2658 ret
2659
2660#if defined(OS_LINUX)
2661 .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
2662#endif
2663#endif
2664
2665
2666
2667
2668
2669// common inner routine with file scope
2670//
// blend and scale for alpha=1.0 and beta=1.0
2672//
2673// input arguments:
2674// r10 <- C
2675// ymm0 <- []
2676// ymm1 <- []
2677// ymm2 <- []
2678// ymm3 <- []
2679// ymm4 <- []
2680// ymm5 <- []
2681// ymm6 <- []
2682// ymm7 <- []
2683// ymm8 <- dirty
2684// ymm9 <- dirty
2685// ymm10 <- dirty
2686// ymm11 <- dirty
2687// ymm15 <- dirty
2688//
2689// output arguments:
2690// r10 <- C
2691// ymm0 <- []
2692// ymm1 <- []
2693// ymm2 <- []
2694// ymm3 <- []
2695// ymm4 <- []
2696// ymm5 <- []
2697// ymm6 <- []
2698// ymm7 <- []
2699// ymm8 <- dirty
2700// ymm9 <- dirty
2701// ymm10 <- dirty
2702// ymm11 <- dirty
2703// ymm15 <- dirty
2704
2705#if MACRO_LEVEL>=1
2706 .macro INNER_BLEND_SCALE_11_8X8_LIB8
2707#else
2708 .p2align 4,,15
2709#if defined(OS_LINUX)
2710 .type inner_blend_scale_11_8x8_lib8, @function
2711inner_blend_scale_11_8x8_lib8:
2712#elif defined(OS_MAC)
2713_inner_blend_scale_11_8x8_lib8:
2714#elif defined(OS_WINDOWS)
2715 .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
2716inner_blend_scale_11_8x8_lib8:
2717#endif
2718#endif
2719
2720 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2721 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2722 vblendps $0xaa, %ymm3, %ymm2, %ymm14
2723 vblendps $0x55, %ymm3, %ymm2, %ymm15
2724
2725 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2726 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2727 vblendps $0xcc, %ymm14, %ymm13, %ymm1
2728 vblendps $0x33, %ymm14, %ymm13, %ymm3
2729
2730 vblendps $0xaa, %ymm5, %ymm4, %ymm12
2731 vblendps $0x55, %ymm5, %ymm4, %ymm13
2732 vblendps $0xaa, %ymm7, %ymm6, %ymm14
2733 vblendps $0x55, %ymm7, %ymm6, %ymm15
2734
2735 vblendps $0xcc, %ymm15, %ymm12, %ymm4
2736 vblendps $0x33, %ymm15, %ymm12, %ymm6
2737 vblendps $0xcc, %ymm14, %ymm13, %ymm5
2738 vblendps $0x33, %ymm14, %ymm13, %ymm7
2739
2740 vmovaps 0(%r10), %ymm15
2741 vaddps %ymm0, %ymm15, %ymm0
2742 vmovaps 32(%r10), %ymm15
2743 vaddps %ymm1, %ymm15, %ymm1
2744 vmovaps 64(%r10), %ymm15
2745 vaddps %ymm2, %ymm15, %ymm2
2746 vmovaps 96(%r10), %ymm15
2747 vaddps %ymm3, %ymm15, %ymm3
2748 vmovaps 128(%r10), %ymm15
2749 vaddps %ymm4, %ymm15, %ymm4
2750 vmovaps 160(%r10), %ymm15
2751 vaddps %ymm5, %ymm15, %ymm5
2752 vmovaps 192(%r10), %ymm15
2753 vaddps %ymm6, %ymm15, %ymm6
2754 vmovaps 224(%r10), %ymm15
2755 vaddps %ymm7, %ymm15, %ymm7
2756
2757#if MACRO_LEVEL>=1
2758 .endm
2759#else
2760 ret
2761
2762#if defined(OS_LINUX)
2763 .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
2764#endif
2765#endif
2766
2767
2768
2769
2770
2771// common inner routine with file scope
2772//
// blend and scale for alpha=1.0 and beta=1.0
2774//
2775// input arguments:
2776// r10 <- offset
2777// r11 <- C
// r12 <- 8*sdc*sizeof(float)
2779// ymm0 <- []
2780// ymm1 <- []
2781// ymm2 <- []
2782// ymm3 <- []
2783// ymm4 <- []
2784// ymm5 <- []
2785// ymm6 <- []
2786// ymm7 <- []
2787// ymm8 <- dirty
2788// ymm9 <- dirty
2789// ymm10 <- dirty
2790// ymm11 <- dirty
2791// ymm15 <- dirty
2792//
2793// output arguments:
2794// r10 <- offset
2795// r11 <- C
// r12 <- 8*sdc*sizeof(float)
2797// ymm0 <- []
2798// ymm1 <- []
2799// ymm2 <- []
2800// ymm3 <- []
2801// ymm4 <- []
2802// ymm5 <- []
2803// ymm6 <- []
2804// ymm7 <- []
2805// ymm8 <- dirty
2806// ymm9 <- dirty
2807// ymm10 <- dirty
2808// ymm11 <- dirty
2809// ymm15 <- dirty
2810
2811#if MACRO_LEVEL>=1
2812 .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
2813#else
2814 .p2align 4,,15
2815#if defined(OS_LINUX)
2816 .type inner_blend_scale_11_8x8_gen_lib8, @function
2817inner_blend_scale_11_8x8_gen_lib8:
2818#elif defined(OS_MAC)
2819_inner_blend_scale_11_8x8_gen_lib8:
2820#elif defined(OS_WINDOWS)
2821 .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
2822inner_blend_scale_11_8x8_gen_lib8:
2823#endif
2824#endif
2825
2826 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2827 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2828 vblendps $0xaa, %ymm3, %ymm2, %ymm14
2829 vblendps $0x55, %ymm3, %ymm2, %ymm15
2830
2831 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2832 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2833 vblendps $0xcc, %ymm14, %ymm13, %ymm1
2834 vblendps $0x33, %ymm14, %ymm13, %ymm3
2835
2836 vblendps $0xaa, %ymm5, %ymm4, %ymm12
2837 vblendps $0x55, %ymm5, %ymm4, %ymm13
2838 vblendps $0xaa, %ymm7, %ymm6, %ymm14
2839 vblendps $0x55, %ymm7, %ymm6, %ymm15
2840
2841 vblendps $0xcc, %ymm15, %ymm12, %ymm4
2842 vblendps $0x33, %ymm15, %ymm12, %ymm6
2843 vblendps $0xcc, %ymm14, %ymm13, %ymm5
2844 vblendps $0x33, %ymm14, %ymm13, %ymm7
2845
2846 cmpl $0, %r10d
2847 jg 0f
2848
2849 // offset==0
2850
2851 vmovaps 0(%r11), %ymm12
2852 vaddps %ymm0, %ymm12, %ymm0
2853 vmovaps 32(%r11), %ymm12
2854 vaddps %ymm1, %ymm12, %ymm1
2855 vmovaps 64(%r11), %ymm12
2856 vaddps %ymm2, %ymm12, %ymm2
2857 vmovaps 96(%r11), %ymm12
2858 vaddps %ymm3, %ymm12, %ymm3
2859 vmovaps 128(%r11), %ymm12
2860 vaddps %ymm4, %ymm12, %ymm4
2861 vmovaps 160(%r11), %ymm12
2862 vaddps %ymm5, %ymm12, %ymm5
2863 vmovaps 192(%r11), %ymm12
2864 vaddps %ymm6, %ymm12, %ymm6
2865 vmovaps 224(%r11), %ymm12
2866 vaddps %ymm7, %ymm12, %ymm7
2867
2868 jmp 7f
2869
28700:
2871
2872 // offset > 0
2873 // 1 2 3 4 5 6 7
2874
	movq	%r11, %r15 // C0
	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2877
2878 cmpl $4, %r10d
2879 jl 1f
2880 jg 2f
2881
2882 // offset==4
2883 // TODO
2884 jmp 7f
2885
28861:
2887 // 1 2 3
2888
2889 cmpl $2, %r10d
2890 jl 3f
2891 jg 4f
2892
2893 // offset==2
2894 // TODO
2895 jmp 7f
2896
28973:
2898 // offset==1
2899 // TODO
2900 jmp 7f
2901
29024:
2903 // offset==3
2904 // TODO
2905 jmp 7f
2906
29072:
2908 // 5 6 7
2909
2910 cmpl $6, %r10d
2911 jl 5f
2912 jg 6f
2913
2914 // offset==6
2915 // TODO
2916 jmp 7f
2917
29185:
2919 // offset==5
2920 // TODO
2921 jmp 7f
2922
29236:
2924 // offset==7
2925 // TODO
2926 jmp 7f
2927
2928 // end
29297:
2930
2931
2932#if MACRO_LEVEL>=1
2933 .endm
2934#else
2935 ret
2936
2937#if defined(OS_LINUX)
2938 .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
2939#endif
2940#endif
2941
2942
2943
2944
2945
2946// common inner routine with file scope
2947//
2948// store n
2949//
2950// input arguments:
2951// r10 <- D
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
//
// output arguments:
// r10 <- D
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
2963
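// Reference sketch (not part of the kernel): the store writes the eight
// accumulator columns back in lib8 panel-major order, i.e. column j of the
// 8x8 block is 8 contiguous floats at byte offset 32*j of D:
//
//     void store_8x8_ref(const float acc[8][8], float *D)
//     {
//         for (int j = 0; j < 8; j++)         // vmovaps %ymm<j>, 32*j(%r10)
//             for (int i = 0; i < 8; i++)
//                 D[8*j + i] = acc[j][i];
//     }
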
2964#if MACRO_LEVEL>=1
2965 .macro INNER_STORE_8X8_LIB8
2966#else
2967 .p2align 4,,15
2968#if defined(OS_LINUX)
2969 .type inner_store_8x8_lib8, @function
2970inner_store_8x8_lib8:
2971#elif defined(OS_MAC)
2972_inner_store_8x8_lib8:
2973#elif defined(OS_WINDOWS)
2974 .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
2975inner_store_8x8_lib8:
2976#endif
2977#endif
2978
2979 vmovaps %ymm0, 0(%r10)
2980 vmovaps %ymm1, 32(%r10)
2981 vmovaps %ymm2, 64(%r10)
2982 vmovaps %ymm3, 96(%r10)
2983 vmovaps %ymm4, 128(%r10)
2984 vmovaps %ymm5, 160(%r10)
2985 vmovaps %ymm6, 192(%r10)
2986 vmovaps %ymm7, 224(%r10)
2987
2988#if MACRO_LEVEL>=1
2989 .endm
2990#else
2991 ret
2992
2993#if defined(OS_LINUX)
2994 .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
2995#endif
2996#endif
2997
2998
2999
3000
3001
3002// common inner routine with file scope
3003//
3004// store n vs
3005//
3006// input arguments:
3007// r10 <- D
3008// r11 <- km
3009// r12 <- kn
3010// ymm0 <- []
3011// ymm1 <- []
3012// ymm2 <- []
3013// ymm3 <- []
3014// ymm4 <- []
3015// ymm5 <- []
3016// ymm6 <- []
3017// ymm7 <- []
3018//
3019// output arguments:
3020// r10 <- D
3021// r11 <- km
3022// r12 <- kn
3023// ymm0 <- []
3024// ymm1 <- []
3025// ymm2 <- []
3026// ymm3 <- []
3027// ymm4 <- []
3028// ymm5 <- []
3029// ymm6 <- []
3030// ymm7 <- []
3031
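// Reference sketch (not part of the kernel): a C model of the masked store,
// assuming .LC00 (defined elsewhere in this file) holds ascending lane
// indices so that the vsubps/vmaskmovps pair keeps exactly the rows i < km.
// Columns 0..4 are always written; the cmpl ladder guards columns 5..7 only,
// which suggests this variant is intended for kn > 4.
//
//     void store_8x8_vs_ref(const float acc[8][8], float *D, int km, int kn)
//     {
//         for (int j = 0; j < 8; j++) {
//             if (j >= 5 && j >= kn)
//                 break;
//             for (int i = 0; i < 8; i++)
//                 if (i < km)
//                     D[8*j + i] = acc[j][i];
//         }
//     }
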
3032#if MACRO_LEVEL>=1
3033 .macro INNER_STORE_8X8_VS_LIB8
3034#else
3035 .p2align 4,,15
3036#if defined(OS_LINUX)
3037 .type inner_store_8x8_vs_lib8, @function
3038inner_store_8x8_vs_lib8:
3039#elif defined(OS_MAC)
3040_inner_store_8x8_vs_lib8:
3041#elif defined(OS_WINDOWS)
3042 .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
3043inner_store_8x8_vs_lib8:
3044#endif
3045#endif
3046
3047 // compute mask for rows
3048 vcvtsi2ss %r11d, %xmm15, %xmm15
3049#if defined(OS_LINUX) | defined(OS_WINDOWS)
3050 vmovups .LC00(%rip), %ymm12
3051#elif defined(OS_MAC)
3052 vmovups LC00(%rip), %ymm12
3053#endif
3054 vshufps $0x00, %xmm15, %xmm15, %xmm15
3055 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
3056 vsubps %ymm15, %ymm12, %ymm15
3057
3058 vmaskmovps %ymm0, %ymm15, 0(%r10)
3059 vmaskmovps %ymm1, %ymm15, 32(%r10)
3060 vmaskmovps %ymm2, %ymm15, 64(%r10)
3061 vmaskmovps %ymm3, %ymm15, 96(%r10)
3062 vmaskmovps %ymm4, %ymm15, 128(%r10)
3063 cmpl $6, %r12d
3064 jl 0f // end
3065 vmaskmovps %ymm5, %ymm15, 160(%r10)
3066 cmpl $7, %r12d
3067 jl 0f // end
3068 vmaskmovps %ymm6, %ymm15, 192(%r10)
3069 je 0f // end
3070 vmaskmovps %ymm7, %ymm15, 224(%r10)
3071
30720:
3073
3074#if MACRO_LEVEL>=1
3075 .endm
3076#else
3077 ret
3078
3079#if defined(OS_LINUX)
3080 .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
3081#endif
3082#endif
3083
3084
3085
3086
3087
3088// common inner routine with file scope
3089//
3090// store n generalized
3091//
3092// input arguments:
3093// r10 <- offset
3094// r11 <- D
// r12 <- 8*sdd*sizeof(float)
3096// r13 <- m0 // row index: start from (inc)
3097// r14 <- m1 // row index: up to (exc)
3098// r15 <- n0 // col index: start from (inc)
3099// rax <- n1 // col index: up to (exc)
3100// rbx <- dirty
3101// ymm0 <- []
3102// ymm1 <- []
3103// ymm2 <- []
3104// ymm3 <- []
3105// ymm4 <- []
3106// ymm5 <- []
3107// ymm6 <- []
3108// ymm7 <- []
3109//
3110// output arguments:
3111// r10 <- offset
3112// r11 <- D
// r12 <- 8*sdd*sizeof(float)
3114// r13 <- m0 // row index: start from (inc)
3115// r14 <- m1 // row index: up to (exc)
3116// r15 <- n1-n0
3117// rax <- n1-n0
3118// rbx <- dirty
3119// ymm0 <- []
3120// ymm1 <- []
3121// ymm2 <- []
3122// ymm3 <- []
3123// ymm4 <- []
3124// ymm5 <- []
3125// ymm6 <- []
3126// ymm7 <- []
3127
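// Reference sketch (not part of the kernel): an idealized C model of the
// offset==0 path of the generalized store.  The accumulator columns are
// shifted left by n0 (with D advanced accordingly), the column range is
// clamped to one 8-wide panel, and the row mask keeps rows in [m0, m1).
// The assembly bounds the columns with the same compare ladder as the vs
// store and still has the offset>0 cases marked TODO.
//
//     void store_8x8_gen_ref(const float acc[8][8], float *D,
//                            int m0, int m1, int n0, int n1)
//     {
//         if (m1 > 8) m1 = 8;
//         if (n1 > 8) n1 = 8;
//         for (int j = n0; j < n1; j++)
//             for (int i = m0; i < m1; i++)
//                 D[8*j + i] = acc[j][i];
//     }
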
3128#if MACRO_LEVEL>=1
3129 .macro INNER_STORE_8X8_GEN_LIB8
3130#else
3131 .p2align 4,,15
3132#if defined(OS_LINUX)
3133 .type inner_store_8x8_gen_lib8, @function
3134inner_store_8x8_gen_lib8:
3135#elif defined(OS_MAC)
3136_inner_store_8x8_gen_lib8:
3137#elif defined(OS_WINDOWS)
3138 .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
3139inner_store_8x8_gen_lib8:
3140#endif
3141#endif
3142
3143 // compute mask for rows
3144 vcvtsi2ss %r13d, %xmm14, %xmm14
3145 vcvtsi2ss %r14d, %xmm15, %xmm15
3146#if defined(OS_LINUX) | defined(OS_WINDOWS)
3147 vmovups .LC00(%rip), %ymm12
3148#elif defined(OS_MAC)
3149 vmovups LC00(%rip), %ymm12
3150#endif
3151 vshufps $0x00, %xmm14, %xmm14, %xmm14
3152 vshufps $0x00, %xmm15, %xmm15, %xmm15
3153 vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
3154 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
3155 vsubps %ymm12, %ymm14, %ymm14
3156 vsubps %ymm15, %ymm12, %ymm15
3157 vandps %ymm14, %ymm15, %ymm15
3158
3159 // shift D and sol for cols
3160 cmpl $0, %r15d
3161 jle 0f
3162
3163 vmovaps %ymm1, %ymm0
3164 vmovaps %ymm2, %ymm1
3165 vmovaps %ymm3, %ymm2
3166 vmovaps %ymm4, %ymm3
3167 vmovaps %ymm5, %ymm4
3168 vmovaps %ymm6, %ymm5
3169 vmovaps %ymm7, %ymm6
3170 addq $32, %r11
3171
3172 cmpl $1, %r15d
3173 jle 0f
3174
3175 vmovaps %ymm1, %ymm0
3176 vmovaps %ymm2, %ymm1
3177 vmovaps %ymm3, %ymm2
3178 vmovaps %ymm4, %ymm3
3179 vmovaps %ymm5, %ymm4
3180 vmovaps %ymm6, %ymm5
3181 addq $32, %r11
3182
3183 cmpl $2, %r15d
3184 jle 0f
3185
	vmovaps	%ymm1, %ymm0
	vmovaps	%ymm2, %ymm1
	vmovaps	%ymm3, %ymm2
	vmovaps	%ymm4, %ymm3
	vmovaps	%ymm5, %ymm4
3190 addq $32, %r11
3191
31920:
3193
3194 // compute number of cols
3195 cmpl $8, %eax
3196 jle 0f
3197 movl $8, %eax
31980:
3199 subl %r15d, %eax
3200 movl %eax, %r15d
3201
3202 cmpl $0, %r10d
3203 jg 0f
3204
3205 // offset==0
3206 vmaskmovps %ymm0, %ymm15, 0(%r11)
3207 vmaskmovps %ymm1, %ymm15, 32(%r11)
3208 vmaskmovps %ymm2, %ymm15, 64(%r11)
3209 vmaskmovps %ymm3, %ymm15, 96(%r11)
3210 vmaskmovps %ymm4, %ymm15, 128(%r11)
3211 cmpl $6, %r15d
3212 jl 7f // end
3213 vmaskmovps %ymm5, %ymm15, 160(%r11)
3214 cmpl $7, %r15d
3215 jl 7f // end
3216 vmaskmovps %ymm6, %ymm15, 192(%r11)
3217 je 7f // end
3218 vmaskmovps %ymm7, %ymm15, 224(%r11)
3219 //
3220 jmp 7f
3221
32220:
3223 // offset > 0
3224 // 1 2 3 4 5 6 7
3225
3226 movq %r11, %rbx // D0
	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
3228
3229 cmpl $4, %r10d
3230 jl 1f
3231 jg 2f
3232
3233 // offset==4
3234 // TODO
3235 jmp 7f
3236
32371:
3238 // 1 2 3
3239
3240 cmpl $2, %r10d
3241 jl 3f
3242 jg 4f
3243
3244 // offset==2
3245 // TODO
3246 jmp 7f
3247
32483:
3249 // offset==1
3250 // TODO
3251 jmp 7f
3252
32534:
3254 // offset==3
3255 // TODO
3256 jmp 7f
3257
32582:
3259 // 5 6 7
3260
3261 cmpl $6, %r10d
3262 jl 5f
3263 jg 6f
3264
3265 // offset==6
3266 // TODO
3267 jmp 7f
3268
32695:
3270 // offset==5
3271 // TODO
3272 jmp 7f
3273
32746:
3275 // offset==7
3276 // TODO
3277 jmp 7f
3278
3279 // end
32807:
3281
3282#if MACRO_LEVEL>=1
3283 .endm
3284#else
3285 ret
3286
3287#if defined(OS_LINUX)
3288 .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
3289#endif
3290#endif
3291
3292
3293
3294
3295
3296// common inner routine with file scope
3297//
// store lower
//
// input arguments:
// r10 <- D
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
//
// output arguments:
// r10 <- D
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
3313
3314#if MACRO_LEVEL>=1
3315 .macro INNER_STORE_L_8X8_LIB8
3316#else
3317 .p2align 4,,15
3318#if defined(OS_LINUX)
3319 .type inner_store_l_8x8_lib8, @function
3320inner_store_l_8x8_lib8:
3321#elif defined(OS_MAC)
3322_inner_store_l_8x8_lib8:
3323#elif defined(OS_WINDOWS)
3324 .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
3325inner_store_l_8x8_lib8:
3326#endif
3327#endif
3328
3329 vmovaps %ymm0, 0(%r10)
3330 vmovaps 32(%r10), %ymm14
3331 vblendps $0x01, %ymm14, %ymm1, %ymm1
3332 vmovaps %ymm1, 32(%r10)
3333 vmovaps 64(%r10), %ymm14
3334 vblendps $0x03, %ymm14, %ymm2, %ymm2
3335 vmovaps %ymm2, 64(%r10)
3336 vmovaps 96(%r10), %ymm14
3337 vblendps $0x07, %ymm14, %ymm3, %ymm3
3338 vmovaps %ymm3, 96(%r10)
3339 vmovaps 128(%r10), %ymm14
3340 vblendps $0x0f, %ymm14, %ymm4, %ymm4
3341 vmovaps %ymm4, 128(%r10)
3342 vmovaps 160(%r10), %ymm14
3343 vblendps $0x1f, %ymm14, %ymm5, %ymm5
3344 vmovaps %ymm5, 160(%r10)
3345 vmovaps 192(%r10), %ymm14
3346 vblendps $0x3f, %ymm14, %ymm6, %ymm6
3347 vmovaps %ymm6, 192(%r10)
3348 vmovaps 224(%r10), %ymm14
3349 vblendps $0x7f, %ymm14, %ymm7, %ymm7
3350 vmovaps %ymm7, 224(%r10)
3351
3352#if MACRO_LEVEL>=1
3353 .endm
3354#else
3355 ret
3356
3357#if defined(OS_LINUX)
3358 .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
3359#endif
3360#endif
3361
3362
3363
3364
3365
3366// common inner routine with file scope
3367//
3368// store lower vs
3369//
3370// input arguments:
3371// r10 <- D
3372// r11 <- km
3373// r12 <- kn
3374// ymm0 <- []
3375// ymm1 <- []
3376// ymm2 <- []
3377// ymm3 <- []
3378// ymm4 <- []
3379// ymm5 <- []
3380// ymm6 <- []
3381// ymm7 <- []
3382//
3383// output arguments:
3384// r10 <- D
3385// r11 <- km
3386// r12 <- kn
3387// ymm0 <- []
3388// ymm1 <- []
3389// ymm2 <- []
3390// ymm3 <- []
3391// ymm4 <- []
3392// ymm5 <- []
3393// ymm6 <- []
3394// ymm7 <- []
3395
3396#if MACRO_LEVEL>=1
3397 .macro INNER_STORE_L_8X8_VS_LIB8
3398#else
3399 .p2align 4,,15
3400#if defined(OS_LINUX)
3401 .type inner_store_l_8x8_vs_lib8, @function
3402inner_store_l_8x8_vs_lib8:
3403#elif defined(OS_MAC)
3404_inner_store_l_8x8_vs_lib8:
3405#elif defined(OS_WINDOWS)
3406 .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
3407inner_store_l_8x8_vs_lib8:
3408#endif
3409#endif
3410
3411 // compute mask for rows
3412 vcvtsi2ss %r11d, %xmm15, %xmm15
3413#if defined(OS_LINUX) | defined(OS_WINDOWS)
3414 vmovups .LC00(%rip), %ymm12
3415#elif defined(OS_MAC)
3416 vmovups LC00(%rip), %ymm12
3417#endif
3418 vshufps $0x00, %xmm15, %xmm15, %xmm15
3419 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
3420 vsubps %ymm15, %ymm12, %ymm15
3421
3422 // offset==0
3423 vmaskmovps %ymm0, %ymm15, 0(%r10)
3424 vmovaps 32(%r10), %ymm12
3425 vblendps $0x01, %ymm12, %ymm1, %ymm1
3426 vmaskmovps %ymm1, %ymm15, 32(%r10)
3427 vmovaps 64(%r10), %ymm12
3428 vblendps $0x03, %ymm12, %ymm2, %ymm2
3429 vmaskmovps %ymm2, %ymm15, 64(%r10)
3430 vmovaps 96(%r10), %ymm12
3431 vblendps $0x07, %ymm12, %ymm3, %ymm3
3432 vmaskmovps %ymm3, %ymm15, 96(%r10)
3433 vmovaps 128(%r10), %ymm12
3434 vblendps $0x0f, %ymm12, %ymm4, %ymm4
3435 vmaskmovps %ymm4, %ymm15, 128(%r10)
3436 cmpl $6, %r12d
3437 jl 0f // end
3438 vmovaps 160(%r10), %ymm12
3439 vblendps $0x1f, %ymm12, %ymm5, %ymm5
3440 vmaskmovps %ymm5, %ymm15, 160(%r10)
3441 cmpl $7, %r12d
3442 jl 0f // end
3443 vmovaps 192(%r10), %ymm12
3444 vblendps $0x3f, %ymm12, %ymm6, %ymm6
3445 vmaskmovps %ymm6, %ymm15, 192(%r10)
3446 je 0f // end
3447 vmovaps 224(%r10), %ymm12
3448 vblendps $0x7f, %ymm12, %ymm7, %ymm7
3449 vmaskmovps %ymm7, %ymm15, 224(%r10)
3450
34510:
3452
3453#if MACRO_LEVEL>=1
3454 .endm
3455#else
3456 ret
3457
3458#if defined(OS_LINUX)
	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
3460#endif
3461#endif
3462
3463
3464
3465
3466
3467// common inner routine with file scope
3468//
3469// store lower generalized
3470//
3471// input arguments:
3472// r10 <- offset
3473// r11 <- D
// r12 <- 8*sdd*sizeof(float)
3475// r13 <- m0 // row index: start from (inc)
3476// r14 <- m1 // row index: up to (exc)
3477// r15 <- n0 // col index: start from (inc)
3478// rax <- n1 // col index: up to (exc)
3479// rbx <- dirty
3480// ymm0 <- []
3481// ymm1 <- []
3482// ymm2 <- []
3483// ymm3 <- []
3484// ymm4 <- []
3485// ymm5 <- []
3486// ymm6 <- []
3487// ymm7 <- []
3488//
3489// output arguments:
3490// r10 <- offset
3491// r11 <- D
// r12 <- 8*sdd*sizeof(float)
3493// r13 <- m0 // row index: start from (inc)
3494// r14 <- m1 // row index: up to (exc)
3495// r15 <- n1-n0
3496// rax <- n1-n0
3497// rbx <- dirty
3498// ymm0 <- []
3499// ymm1 <- []
3500// ymm2 <- []
3501// ymm3 <- []
3502// ymm4 <- []
3503// ymm5 <- []
3504// ymm6 <- []
3505// ymm7 <- []
3506
3507#if MACRO_LEVEL>=1
3508 .macro INNER_STORE_L_8X8_GEN_LIB8
3509#else
3510 .p2align 4,,15
3511#if defined(OS_LINUX)
3512 .type inner_store_l_8x8_gen_lib8, @function
3513inner_store_l_8x8_gen_lib8:
3514#elif defined(OS_MAC)
3515_inner_store_l_8x8_gen_lib8:
3516#elif defined(OS_WINDOWS)
3517 .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
3518inner_store_l_8x8_gen_lib8:
3519#endif
3520#endif
3521
3522 // compute mask for rows
3523 vcvtsi2ss %r13d, %xmm14, %xmm14
3524 vcvtsi2ss %r14d, %xmm15, %xmm15
3525#if defined(OS_LINUX) | defined(OS_WINDOWS)
3526 vmovups .LC00(%rip), %ymm12
3527#elif defined(OS_MAC)
3528 vmovups LC00(%rip), %ymm12
3529#endif
3530 vshufps $0x00, %xmm14, %xmm14, %xmm14
3531 vshufps $0x00, %xmm15, %xmm15, %xmm15
3532 vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
3533 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
3534 vsubps %ymm12, %ymm14, %ymm14
3535 vsubps %ymm15, %ymm12, %ymm15
3536 vandps %ymm14, %ymm15, %ymm15
3537
3538 // shift D and sol for cols
3539 cmpl $0, %r15d
3540 jle 0f
3541
3542 vmovaps %ymm1, %ymm0
3543 vmovaps %ymm2, %ymm1
3544 vmovaps %ymm3, %ymm2
3545 vmovaps %ymm4, %ymm3
3546 vmovaps %ymm5, %ymm4
3547 vmovaps %ymm6, %ymm5
3548 vmovaps %ymm7, %ymm6
3549 addq $32, %r11
3550
3551 cmpl $1, %r15d
3552 jle 0f
3553
3554 vmovaps %ymm1, %ymm0
3555 vmovaps %ymm2, %ymm1
3556 vmovaps %ymm3, %ymm2
3557 vmovaps %ymm4, %ymm3
3558 vmovaps %ymm5, %ymm4
3559 vmovaps %ymm6, %ymm5
3560 addq $32, %r11
3561
3562 cmpl $2, %r15d
3563 jle 0f
3564
	vmovaps	%ymm1, %ymm0
	vmovaps	%ymm2, %ymm1
	vmovaps	%ymm3, %ymm2
	vmovaps	%ymm4, %ymm3
	vmovaps	%ymm5, %ymm4
3569 addq $32, %r11
3570
35710:
3572
3573 // compute number of cols
3574 cmpl $8, %eax
3575 jle 0f
3576 movl $8, %eax
35770:
3578 subl %r15d, %eax
3579 movl %eax, %r15d
3580
3581 cmpl $0, %r10d
3582 jg 0f
3583
3584 // offset==0
3585 vmaskmovps %ymm0, %ymm15, 0(%r11)
3586 vmovaps 32(%r11), %ymm12
3587 vblendps $0x01, %ymm12, %ymm1, %ymm1
3588 vmaskmovps %ymm1, %ymm15, 32(%r11)
3589 vmovaps 64(%r11), %ymm12
3590 vblendps $0x03, %ymm12, %ymm2, %ymm2
3591 vmaskmovps %ymm2, %ymm15, 64(%r11)
3592 vmovaps 96(%r11), %ymm12
3593 vblendps $0x07, %ymm12, %ymm3, %ymm3
3594 vmaskmovps %ymm3, %ymm15, 96(%r11)
3595 vmovaps 128(%r11), %ymm12
3596 vblendps $0x0f, %ymm12, %ymm4, %ymm4
3597 vmaskmovps %ymm4, %ymm15, 128(%r11)
3598 cmpl $6, %r15d
3599 jl 7f // end
3600 vmovaps 160(%r11), %ymm12
3601 vblendps $0x1f, %ymm12, %ymm5, %ymm5
3602 vmaskmovps %ymm5, %ymm15, 160(%r11)
3603 cmpl $7, %r15d
3604 jl 7f // end
3605 vmovaps 192(%r11), %ymm12
3606 vblendps $0x3f, %ymm12, %ymm6, %ymm6
3607 vmaskmovps %ymm6, %ymm15, 192(%r11)
3608 je 7f // end
3609 vmovaps 224(%r11), %ymm12
3610 vblendps $0x7f, %ymm12, %ymm7, %ymm7
3611 vmaskmovps %ymm7, %ymm15, 224(%r11)
3612 //
3613 jmp 7f
3614
36150:
3616 // offset > 0
3617 // 1 2 3 4 5 6 7
3618
3619 movq %r11, %rbx // D0
	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
3621
3622 cmpl $4, %r10d
3623 jl 1f
3624 jg 2f
3625
3626 // offset==4
3627 // TODO
3628 jmp 7f
3629
36301:
3631 // 1 2 3
3632
3633 cmpl $2, %r10d
3634 jl 3f
3635 jg 4f
3636
3637 // offset==2
3638 // TODO
3639 jmp 7f
3640
36413:
3642 // offset==1
3643 // TODO
3644 jmp 7f
3645
36464:
3647 // offset==3
3648 // TODO
3649 jmp 7f
3650
36512:
3652 // 5 6 7
3653
3654 cmpl $6, %r10d
3655 jl 5f
3656 jg 6f
3657
3658 // offset==6
3659 // TODO
3660 jmp 7f
3661
36625:
3663 // offset==5
3664 // TODO
3665 jmp 7f
3666
36676:
3668 // offset==7
3669 // TODO
3670 jmp 7f
3671
3672 // end
36737:
3674
3675#if MACRO_LEVEL>=1
3676 .endm
3677#else
3678 ret
3679
3680#if defined(OS_LINUX)
	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
3682#endif
3683#endif
3684
3685
3686
3687
3688
3689// rdi rsi rdx rcx r8 r9 rsp+8
3690// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
3691
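// Reference sketch (not part of the build): a plain C version of the operation
// this kernel computes, assuming the lib8 panel-major layout in which element
// (i,l) of an 8 x k panel is stored at p[i + 8*l]:
//
//     void kernel_sgemm_nt_8x8_lib8_ref(int k, const float *alpha, const float *A,
//                                       const float *B, const float *beta,
//                                       const float *C, float *D)
//     {
//         for (int j = 0; j < 8; j++)
//             for (int i = 0; i < 8; i++) {
//                 float d = 0.0f;
//                 for (int l = 0; l < k; l++)
//                     d += A[i + 8*l] * B[j + 8*l];   // nt: B is accessed transposed
//                 D[i + 8*j] = alpha[0]*d + beta[0]*C[i + 8*j];
//             }
//     }
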
3692 .p2align 4,,15
3693#if defined(OS_LINUX)
3694 .globl kernel_sgemm_nt_8x8_lib8
3695 .type kernel_sgemm_nt_8x8_lib8, @function
3696kernel_sgemm_nt_8x8_lib8:
3697#elif defined(OS_MAC)
3698 .globl _kernel_sgemm_nt_8x8_lib8
3699_kernel_sgemm_nt_8x8_lib8:
3700#elif defined(OS_WINDOWS)
3701 .globl kernel_sgemm_nt_8x8_lib8
3702 .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
3703kernel_sgemm_nt_8x8_lib8:
3704#endif
3705
3706 PROLOGUE
3707
3708 // zero accumulation registers
3709
3710 vxorpd %ymm0, %ymm0, %ymm0
3711 vmovapd %ymm0, %ymm1
3712 vmovapd %ymm0, %ymm2
3713 vmovapd %ymm0, %ymm3
3714 vmovapd %ymm0, %ymm4
3715 vmovapd %ymm0, %ymm5
3716 vmovapd %ymm0, %ymm6
3717 vmovapd %ymm0, %ymm7
3718
3719
3720 // call inner dgemm kernel nt
3721
3722 movq ARG1, %r10 // k
3723 movq ARG3, %r11 // A
3724 movq ARG4, %r12 // B
3725
3726#if MACRO_LEVEL>=2
3727 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3728#else
3729#if defined(OS_LINUX) | defined(OS_WINDOWS)
3730 call inner_kernel_gemm_add_nt_8x8_lib8
3731#elif defined(OS_MAC)
3732 callq _inner_kernel_gemm_add_nt_8x8_lib8
3733#endif
3734#endif
3735
3736
3737 // call inner scale
3738
3739 movq ARG2, %r10 // alpha
3740 movq ARG5, %r11 // beta
3741 movq ARG6, %r12 // C
3742
3743#if MACRO_LEVEL>=1
3744 INNER_BLEND_SCALE_AB_8X8_LIB8
3745#else
3746#if defined(OS_LINUX) | defined(OS_WINDOWS)
3747 call inner_blend_scale_ab_8x8_lib8
3748#elif defined(OS_MAC)
3749 callq _inner_blend_scale_ab_8x8_lib8
3750#endif
3751#endif
3752
3753
3754 // store n
3755
3756 movq ARG7, %r10 // D
3757
3758#if MACRO_LEVEL>=1
3759 INNER_STORE_8X8_LIB8
3760#else
3761#if defined(OS_LINUX) | defined(OS_WINDOWS)
3762 call inner_store_8x8_lib8
3763#elif defined(OS_MAC)
3764 callq _inner_store_8x8_lib8
3765#endif
3766#endif
3767
3768
3769 EPILOGUE
3770
3771 ret
3772
3773#if defined(OS_LINUX)
3774 .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
3775#endif
3776
3777
3778
3779
3780
3781// 1 2 3 4 5 6 7 8 9
3782// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
3783
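// The _vs ("variable size") variant computes the same product but stores only
// the leading km x kn block of D (see INNER_STORE_8X8_VS_LIB8), e.g. for the
// tail of a matrix whose dimensions are not multiples of 8.  Illustrative call
// with hypothetical sizes:
//
//     kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, A, B, &beta, C, D, 5, 7);  // 5 valid rows, 7 valid cols
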
3784 .p2align 4,,15
3785#if defined(OS_LINUX)
3786 .globl kernel_sgemm_nt_8x8_vs_lib8
3787 .type kernel_sgemm_nt_8x8_vs_lib8, @function
3788kernel_sgemm_nt_8x8_vs_lib8:
3789#elif defined(OS_MAC)
3790 .globl _kernel_sgemm_nt_8x8_vs_lib8
3791_kernel_sgemm_nt_8x8_vs_lib8:
3792#elif defined(OS_WINDOWS)
3793 .globl kernel_sgemm_nt_8x8_vs_lib8
3794 .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
3795kernel_sgemm_nt_8x8_vs_lib8:
3796#endif
3797
3798 PROLOGUE
3799
3800 // zero accumulation registers
3801
3802 vxorpd %ymm0, %ymm0, %ymm0
3803 vmovapd %ymm0, %ymm1
3804 vmovapd %ymm0, %ymm2
3805 vmovapd %ymm0, %ymm3
3806 vmovapd %ymm0, %ymm4
3807 vmovapd %ymm0, %ymm5
3808 vmovapd %ymm0, %ymm6
3809 vmovapd %ymm0, %ymm7
3810
3811
3812 // call inner dgemm kernel nt
3813
3814 movq ARG1, %r10 // k
3815 movq ARG3, %r11 // A
3816 movq ARG4, %r12 // B
3817
3818#if MACRO_LEVEL>=2
3819 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3820#else
3821#if defined(OS_LINUX) | defined(OS_WINDOWS)
3822 call inner_kernel_gemm_add_nt_8x8_lib8
3823#elif defined(OS_MAC)
3824 callq _inner_kernel_gemm_add_nt_8x8_lib8
3825#endif
3826#endif
3827
3828
3829 // call inner scale
3830
3831 movq ARG2, %r10 // alpha
3832 movq ARG5, %r11 // beta
3833 movq ARG6, %r12 // C
3834
3835#if MACRO_LEVEL>=1
3836 INNER_BLEND_SCALE_AB_8X8_LIB8
3837#else
3838#if defined(OS_LINUX) | defined(OS_WINDOWS)
3839 call inner_blend_scale_ab_8x8_lib8
3840#elif defined(OS_MAC)
3841 callq _inner_blend_scale_ab_8x8_lib8
3842#endif
3843#endif
3844
3845
3846 // store n
3847
3848 movq ARG7, %r10 // D
	movq	ARG8, %r11 // km
	movq	ARG9, %r12 // kn
3851
3852#if MACRO_LEVEL>=1
3853 INNER_STORE_8X8_VS_LIB8
3854#else
3855#if defined(OS_LINUX) | defined(OS_WINDOWS)
3856 call inner_store_8x8_vs_lib8
3857#elif defined(OS_MAC)
3858 callq _inner_store_8x8_vs_lib8
3859#endif
3860#endif
3861
3862
3863 EPILOGUE
3864
3865 ret
3866
3867#if defined(OS_LINUX)
3868 .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
3869#endif
3870
3871
3872
3873
3874
3875// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
3876// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
3877
3878 .p2align 4,,15
3879#if defined(OS_LINUX)
3880 .globl kernel_sgemm_nt_8x8_gen_lib8
3881 .type kernel_sgemm_nt_8x8_gen_lib8, @function
3882kernel_sgemm_nt_8x8_gen_lib8:
3883#elif defined(OS_MAC)
3884 .globl _kernel_sgemm_nt_8x8_gen_lib8
3885_kernel_sgemm_nt_8x8_gen_lib8:
3886#elif defined(OS_WINDOWS)
3887 .globl kernel_sgemm_nt_8x8_gen_lib8
3888 .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
3889kernel_sgemm_nt_8x8_gen_lib8:
3890#endif
3891
3892 PROLOGUE
3893
3894 // zero accumulation registers
3895
3896 vxorpd %ymm0, %ymm0, %ymm0
3897 vmovapd %ymm0, %ymm1
3898 vmovapd %ymm0, %ymm2
3899 vmovapd %ymm0, %ymm3
3900 vmovapd %ymm0, %ymm4
3901 vmovapd %ymm0, %ymm5
3902 vmovapd %ymm0, %ymm6
3903 vmovapd %ymm0, %ymm7
3904
3905
3906 // call inner dgemm kernel nt
3907
3908 movq ARG1, %r10 // k
3909 movq ARG3, %r11 // A
3910 movq ARG4, %r12 // B
3911
3912#if MACRO_LEVEL>=2
3913 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3914#else
3915#if defined(OS_LINUX) | defined(OS_WINDOWS)
3916 call inner_kernel_gemm_add_nt_8x8_lib8
3917#elif defined(OS_MAC)
3918 callq _inner_kernel_gemm_add_nt_8x8_lib8
3919#endif
3920#endif
3921
3922
3923 // call inner blend scale
3924
3925 movq ARG2, %r10 // alpha
3926 movq ARG5, %r11 // beta
3927 movq ARG6, %r12 // offsetC
3928 movq ARG7, %r13 // C
3929 movq ARG8, %r14 // sdc
3930 sall $5, %r14d // 8*sdc*sizeof(float)
3931
3932#if MACRO_LEVEL>=1
3933 INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
3934#else
3935#if defined(OS_LINUX) | defined(OS_WINDOWS)
3936 call inner_blend_scale_ab_8x8_gen_lib8
3937#elif defined(OS_MAC)
3938 callq _inner_blend_scale_ab_8x8_gen_lib8
3939#endif
3940#endif
3941
3942
3943 // store n gen
3944
3945 movq ARG9, %r10 // offsetD
3946 movq ARG10, %r11 // D
3947 movq ARG11, %r12 // sdd
	sall	$5, %r12d // 8*sdd*sizeof(float)
3949 movq ARG12, %r13 // m0
3950 movq ARG13, %r14 // m1
3951 movq ARG14, %r15 // n0
3952 movq ARG15, %rax // n1
3953
3954#if MACRO_LEVEL>=1
3955 INNER_STORE_8X8_GEN_LIB8
3956#else
3957#if defined(OS_LINUX) | defined(OS_WINDOWS)
3958 call inner_store_8x8_gen_lib8
3959#elif defined(OS_MAC)
3960 callq _inner_store_8x8_gen_lib8
3961#endif
3962#endif
3963
3964
3965 EPILOGUE
3966
3967 ret
3968
3969#if defined(OS_LINUX)
3970 .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
3971#endif
3972
3973
3974
3975
3976
3977// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
3978// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
3979
3980 .p2align 4,,15
3981#if defined(OS_LINUX)
3982 .globl kernel_sgemm_nn_8x8_lib8
3983 .type kernel_sgemm_nn_8x8_lib8, @function
3984kernel_sgemm_nn_8x8_lib8:
3985#elif defined(OS_MAC)
3986 .globl _kernel_sgemm_nn_8x8_lib8
3987_kernel_sgemm_nn_8x8_lib8:
3988#elif defined(OS_WINDOWS)
3989 .globl kernel_sgemm_nn_8x8_lib8
3990 .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
3991kernel_sgemm_nn_8x8_lib8:
3992#endif
3993
3994 PROLOGUE
3995
3996 // zero accumulation registers
3997
3998 vxorps %ymm0, %ymm0, %ymm0
3999 vmovaps %ymm0, %ymm1
4000 vmovaps %ymm0, %ymm2
4001 vmovaps %ymm0, %ymm3
4002 vmovaps %ymm0, %ymm4
4003 vmovaps %ymm0, %ymm5
4004 vmovaps %ymm0, %ymm6
4005 vmovaps %ymm0, %ymm7
4006
4007
4008 // call inner dgemm kernel nn
4009
4010 movq ARG1, %r10 // k
4011 movq ARG3, %r11 // A
4012 movq ARG5, %r12 // B
4013 movq ARG6, %r13 // sdb
	sall	$5, %r13d // 8*sdb*sizeof(float)
4015 movq ARG4, %r14 // offsetB
4016
4017#if MACRO_LEVEL>=1
4018 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
4019#else
4020#if defined(OS_LINUX) | defined(OS_WINDOWS)
4021 call inner_edge_gemm_add_nn_8x8_lib8
4022#elif defined(OS_MAC)
4023 callq _inner_edge_gemm_add_nn_8x8_lib8
4024#endif
4025#endif
4026
4027#if MACRO_LEVEL>=2
4028 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
4029#else
4030#if defined(OS_LINUX) | defined(OS_WINDOWS)
4031 call inner_kernel_gemm_add_nn_8x8_lib8
4032#elif defined(OS_MAC)
4033 callq _inner_kernel_gemm_add_nn_8x8_lib8
4034#endif
4035#endif
4036
4037
4038 // call inner blend
4039
4040 movq ARG2, %r10 // alpha
4041 movq ARG7, %r11 // beta
4042 movq ARG8, %r12 // C
4043
4044#if MACRO_LEVEL>=1
4045 INNER_SCALE_AB_8X8_LIB8
4046#else
4047#if defined(OS_LINUX) | defined(OS_WINDOWS)
4048 call inner_scale_ab_8x8_lib8
4049#elif defined(OS_MAC)
4050 callq _inner_scale_ab_8x8_lib8
4051#endif
4052#endif
4053
4054
4055 // store n
4056
4057 movq ARG9, %r10 // D
4058
4059#if MACRO_LEVEL>=1
4060 INNER_STORE_8X8_LIB8
4061#else
4062#if defined(OS_LINUX) | defined(OS_WINDOWS)
4063 call inner_store_8x8_lib8
4064#elif defined(OS_MAC)
4065 callq _inner_store_8x8_lib8
4066#endif
4067#endif
4068
4069
4070 EPILOGUE
4071
4072 ret
4073
4074#if defined(OS_LINUX)
4075 .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
4076#endif
4077
4078
4079
4080
4081
4082// 1 2 3 4 5 6 7 8 9 10 11
// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
4084
4085 .p2align 4,,15
4086#if defined(OS_LINUX)
4087 .globl kernel_sgemm_nn_8x8_vs_lib8
4088 .type kernel_sgemm_nn_8x8_vs_lib8, @function
4089kernel_sgemm_nn_8x8_vs_lib8:
4090#elif defined(OS_MAC)
4091 .globl _kernel_sgemm_nn_8x8_vs_lib8
4092_kernel_sgemm_nn_8x8_vs_lib8:
4093#elif defined(OS_WINDOWS)
4094 .globl kernel_sgemm_nn_8x8_vs_lib8
4095 .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
4096kernel_sgemm_nn_8x8_vs_lib8:
4097#endif
4098
4099 PROLOGUE
4100
4101 // zero accumulation registers
4102
4103 vxorps %ymm0, %ymm0, %ymm0
4104 vmovaps %ymm0, %ymm1
4105 vmovaps %ymm0, %ymm2
4106 vmovaps %ymm0, %ymm3
4107 vmovaps %ymm0, %ymm4
4108 vmovaps %ymm0, %ymm5
4109 vmovaps %ymm0, %ymm6
4110 vmovaps %ymm0, %ymm7
4111
4112
4113 // call inner dgemm kernel nn
4114
4115 movq ARG1, %r10 // k
4116 movq ARG3, %r11 // A
4117 movq ARG5, %r12 // B
4118 movq ARG6, %r13 // sdb
	sall	$5, %r13d // 8*sdb*sizeof(float)
4120 movq ARG4, %r14 // offsetB
4121
4122#if MACRO_LEVEL>=1
4123 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
4124#else
4125#if defined(OS_LINUX) | defined(OS_WINDOWS)
4126 call inner_edge_gemm_add_nn_8x8_lib8
4127#elif defined(OS_MAC)
4128 callq _inner_edge_gemm_add_nn_8x8_lib8
4129#endif
4130#endif
4131
4132#if MACRO_LEVEL>=2
4133 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
4134#else
4135#if defined(OS_LINUX) | defined(OS_WINDOWS)
4136 call inner_kernel_gemm_add_nn_8x8_lib8
4137#elif defined(OS_MAC)
4138 callq _inner_kernel_gemm_add_nn_8x8_lib8
4139#endif
4140#endif
4141
4142
4143 // call inner blend
4144
4145 movq ARG2, %r10 // alpha
4146 movq ARG7, %r11 // beta
4147 movq ARG8, %r12 // C
4148
4149#if MACRO_LEVEL>=1
4150 INNER_SCALE_AB_8X8_LIB8
4151#else
4152#if defined(OS_LINUX) | defined(OS_WINDOWS)
4153 call inner_scale_ab_8x8_lib8
4154#elif defined(OS_MAC)
4155 callq _inner_scale_ab_8x8_lib8
4156#endif
4157#endif
4158
4159
4160 // store n
4161
4162 movq ARG9, %r10 // D
	movq	ARG10, %r11 // km
	movq	ARG11, %r12 // kn
4165
4166#if MACRO_LEVEL>=1
4167 INNER_STORE_8X8_VS_LIB8
4168#else
4169#if defined(OS_LINUX) | defined(OS_WINDOWS)
4170 call inner_store_8x8_vs_lib8
4171#elif defined(OS_MAC)
4172 callq _inner_store_8x8_vs_lib8
4173#endif
4174#endif
4175
4176
4177 EPILOGUE
4178
4179 ret
4180
4181#if defined(OS_LINUX)
4182 .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
4183#endif
4184
4185
4186
4187
4188
4189// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
4191
4192 .p2align 4,,15
4193#if defined(OS_LINUX)
4194 .globl kernel_sgemm_nn_8x8_gen_lib8
4195 .type kernel_sgemm_nn_8x8_gen_lib8, @function
4196kernel_sgemm_nn_8x8_gen_lib8:
4197#elif defined(OS_MAC)
4198 .globl _kernel_sgemm_nn_8x8_gen_lib8
4199_kernel_sgemm_nn_8x8_gen_lib8:
4200#elif defined(OS_WINDOWS)
4201 .globl kernel_sgemm_nn_8x8_gen_lib8
4202 .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
4203kernel_sgemm_nn_8x8_gen_lib8:
4204#endif
4205
4206 PROLOGUE
4207
4208 // zero accumulation registers
4209
4210 vxorps %ymm0, %ymm0, %ymm0
4211 vmovaps %ymm0, %ymm1
4212 vmovaps %ymm0, %ymm2
4213 vmovaps %ymm0, %ymm3
4214 vmovaps %ymm0, %ymm4
4215 vmovaps %ymm0, %ymm5
4216 vmovaps %ymm0, %ymm6
4217 vmovaps %ymm0, %ymm7
4218
4219
4220 // call inner dgemm kernel nn
4221
4222 movq ARG1, %r10 // k
4223 movq ARG3, %r11 // A
4224 movq ARG5, %r12 // B
4225 movq ARG6, %r13 // sdb
	sall	$5, %r13d // 8*sdb*sizeof(float)
4227 movq ARG4, %r14 // offsetB
4228
4229#if MACRO_LEVEL>=1
4230 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
4231#else
4232#if defined(OS_LINUX) | defined(OS_WINDOWS)
4233 call inner_edge_gemm_add_nn_8x8_lib8
4234#elif defined(OS_MAC)
4235 callq _inner_edge_gemm_add_nn_8x8_lib8
4236#endif
4237#endif
4238
4239#if MACRO_LEVEL>=2
4240 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
4241#else
4242#if defined(OS_LINUX) | defined(OS_WINDOWS)
4243 call inner_kernel_gemm_add_nn_8x8_lib8
4244#elif defined(OS_MAC)
4245 callq _inner_kernel_gemm_add_nn_8x8_lib8
4246#endif
4247#endif
4248
4249
4250 // call inner blend scale
4251
4252 movq ARG2, %r10 // alpha
4253 movq ARG7, %r11 // beta
4254 movq ARG8, %r12 // offsetC
4255 movq ARG9, %r13 // C
4256 movq ARG10, %r14 // sdc
	sall	$5, %r14d // 8*sdc*sizeof(float)
4258
4259#if MACRO_LEVEL>=1
4260 INNER_SCALE_AB_8X8_GEN_LIB8
4261#else
4262#if defined(OS_LINUX) | defined(OS_WINDOWS)
4263 call inner_scale_ab_8x8_gen_lib8
4264#elif defined(OS_MAC)
4265 callq _inner_scale_ab_8x8_gen_lib8
4266#endif
4267#endif
4268
4269
4270 // store n gen
4271
4272 movq ARG11, %r10 // offsetD
4273 movq ARG12, %r11 // D
4274 movq ARG13, %r12 // sdd
	sall	$5, %r12d // 8*sdd*sizeof(float)
4276 movq ARG14, %r13 // m0
4277 movq ARG15, %r14 // m1
4278 movq ARG16, %r15 // n0
4279 movq ARG17, %rax // n1
4280
4281#if MACRO_LEVEL>=1
4282 INNER_STORE_8X8_GEN_LIB8
4283#else
4284#if defined(OS_LINUX) | defined(OS_WINDOWS)
4285 call inner_store_8x8_gen_lib8
4286#elif defined(OS_MAC)
4287 callq _inner_store_8x8_gen_lib8
4288#endif
4289#endif
4290
4291
4292 EPILOGUE
4293
4294 ret
4295
4296#if defined(OS_LINUX)
4297 .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
4298#endif
4299
4300
4301
4302
4303
4304// rdi rsi rdx rcx r8 r9 rsp+8
4305// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
4306
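// Reference sketch (not part of the build): the _l variant updates only the
// lower triangle of D; INNER_STORE_L_8X8_LIB8 blends the strictly upper part
// of each column back from memory, so those entries are left unchanged.
//
//     void kernel_ssyrk_nt_l_8x8_lib8_ref(int k, const float *alpha, const float *A,
//                                         const float *B, const float *beta,
//                                         const float *C, float *D)
//     {
//         for (int j = 0; j < 8; j++)
//             for (int i = j; i < 8; i++) {       // i >= j only
//                 float d = 0.0f;
//                 for (int l = 0; l < k; l++)
//                     d += A[i + 8*l] * B[j + 8*l];
//                 D[i + 8*j] = alpha[0]*d + beta[0]*C[i + 8*j];
//             }
//     }
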
4307 .p2align 4,,15
4308#if defined(OS_LINUX)
4309 .globl kernel_ssyrk_nt_l_8x8_lib8
4310 .type kernel_ssyrk_nt_l_8x8_lib8, @function
4311kernel_ssyrk_nt_l_8x8_lib8:
4312#elif defined(OS_MAC)
4313 .globl _kernel_ssyrk_nt_l_8x8_lib8
4314_kernel_ssyrk_nt_l_8x8_lib8:
4315#elif defined(OS_WINDOWS)
4316 .globl kernel_ssyrk_nt_l_8x8_lib8
4317 .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
4318kernel_ssyrk_nt_l_8x8_lib8:
4319#endif
4320
4321 PROLOGUE
4322
4323 // zero accumulation registers
4324
4325 vxorpd %ymm0, %ymm0, %ymm0
4326 vmovapd %ymm0, %ymm1
4327 vmovapd %ymm0, %ymm2
4328 vmovapd %ymm0, %ymm3
4329 vmovapd %ymm0, %ymm4
4330 vmovapd %ymm0, %ymm5
4331 vmovapd %ymm0, %ymm6
4332 vmovapd %ymm0, %ymm7
4333
4334
4335 // call inner dgemm kernel nt
4336
4337 movq ARG1, %r10 // k
4338 movq ARG3, %r11 // A
4339 movq ARG4, %r12 // B
4340
4341#if MACRO_LEVEL>=2
4342 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4343#else
4344#if defined(OS_LINUX) | defined(OS_WINDOWS)
4345 call inner_kernel_gemm_add_nt_8x8_lib8
4346#elif defined(OS_MAC)
4347 callq _inner_kernel_gemm_add_nt_8x8_lib8
4348#endif
4349#endif
4350
4351
4352 // call inner scale
4353
4354 movq ARG2, %r10 // alpha
4355 movq ARG5, %r11 // beta
4356 movq ARG6, %r12 // C
4357
4358#if MACRO_LEVEL>=1
4359 INNER_BLEND_SCALE_AB_8X8_LIB8
4360#else
4361#if defined(OS_LINUX) | defined(OS_WINDOWS)
4362 call inner_blend_scale_ab_8x8_lib8
4363#elif defined(OS_MAC)
4364 callq _inner_blend_scale_ab_8x8_lib8
4365#endif
4366#endif
4367
4368
4369 // store n
4370
4371 movq ARG7, %r10 // D
4372
4373#if MACRO_LEVEL>=1
4374 INNER_STORE_L_8X8_LIB8
4375#else
4376#if defined(OS_LINUX) | defined(OS_WINDOWS)
4377 call inner_store_l_8x8_lib8
4378#elif defined(OS_MAC)
4379 callq _inner_store_l_8x8_lib8
4380#endif
4381#endif
4382
4383
4384 EPILOGUE
4385
4386 ret
4387
4388#if defined(OS_LINUX)
4389 .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
4390#endif
4391
4392
4393
4394
4395
4396// 1 2 3 4 5 6 7 8 9
4397// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
4398
4399 .p2align 4,,15
4400#if defined(OS_LINUX)
4401 .globl kernel_ssyrk_nt_l_8x8_vs_lib8
4402 .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
4403kernel_ssyrk_nt_l_8x8_vs_lib8:
4404#elif defined(OS_MAC)
4405 .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
4406_kernel_ssyrk_nt_l_8x8_vs_lib8:
4407#elif defined(OS_WINDOWS)
4408 .globl kernel_ssyrk_nt_l_8x8_vs_lib8
4409 .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
4410kernel_ssyrk_nt_l_8x8_vs_lib8:
4411#endif
4412
4413 PROLOGUE
4414
4415 // zero accumulation registers
4416
4417 vxorpd %ymm0, %ymm0, %ymm0
4418 vmovapd %ymm0, %ymm1
4419 vmovapd %ymm0, %ymm2
4420 vmovapd %ymm0, %ymm3
4421 vmovapd %ymm0, %ymm4
4422 vmovapd %ymm0, %ymm5
4423 vmovapd %ymm0, %ymm6
4424 vmovapd %ymm0, %ymm7
4425
4426
4427 // call inner dgemm kernel nt
4428
4429 movq ARG1, %r10 // k
4430 movq ARG3, %r11 // A
4431 movq ARG4, %r12 // B
4432
4433#if MACRO_LEVEL>=2
4434 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4435#else
4436#if defined(OS_LINUX) | defined(OS_WINDOWS)
4437 call inner_kernel_gemm_add_nt_8x8_lib8
4438#elif defined(OS_MAC)
4439 callq _inner_kernel_gemm_add_nt_8x8_lib8
4440#endif
4441#endif
4442
4443
4444 // call inner scale
4445
4446 movq ARG2, %r10 // alpha
4447 movq ARG5, %r11 // beta
4448 movq ARG6, %r12 // C
4449
4450#if MACRO_LEVEL>=1
4451 INNER_BLEND_SCALE_AB_8X8_LIB8
4452#else
4453#if defined(OS_LINUX) | defined(OS_WINDOWS)
4454 call inner_blend_scale_ab_8x8_lib8
4455#elif defined(OS_MAC)
4456 callq _inner_blend_scale_ab_8x8_lib8
4457#endif
4458#endif
4459
4460
4461 // store n
4462
4463 movq ARG7, %r10 // D
4464 movq ARG8, %r11 // km
4465 movq ARG9, %r12 // kn
4466
4467#if MACRO_LEVEL>=1
4468 INNER_STORE_L_8X8_VS_LIB8
4469#else
4470#if defined(OS_LINUX) | defined(OS_WINDOWS)
4471 call inner_store_l_8x8_vs_lib8
4472#elif defined(OS_MAC)
4473 callq _inner_store_l_8x8_vs_lib8
4474#endif
4475#endif
4476
4477
4478 EPILOGUE
4479
4480 ret
4481
4482#if defined(OS_LINUX)
4483 .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
4484#endif
4485
4486
4487
4488
4489
4490// edi rsi rdx ecx r8 r9 rsp+8
4491// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
4492
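// Reference sketch (not part of the build): the kernel first computes
// X = C - A*B^T, then solves X_out * E^T = X for a lower triangular E ("rl"),
// using the precomputed reciprocal diagonal inv_diag_E ("inv"); panel-major
// layout as above.
//
//     void kernel_strsm_nt_rl_inv_8x8_lib8_ref(int k, const float *A, const float *B,
//                                              const float *C, float *D,
//                                              const float *E, const float *inv_diag_E)
//     {
//         float X[8][8];                          // X[j][i] = element (i,j)
//         for (int j = 0; j < 8; j++)
//             for (int i = 0; i < 8; i++) {
//                 float d = C[i + 8*j];
//                 for (int l = 0; l < k; l++)
//                     d -= A[i + 8*l] * B[j + 8*l];
//                 X[j][i] = d;
//             }
//         for (int j = 0; j < 8; j++) {           // forward substitution on columns
//             for (int l = 0; l < j; l++)
//                 for (int i = 0; i < 8; i++)
//                     X[j][i] -= E[j + 8*l] * X[l][i];
//             for (int i = 0; i < 8; i++) {
//                 X[j][i] *= inv_diag_E[j];       // divide by E[j][j]
//                 D[i + 8*j] = X[j][i];
//             }
//         }
//     }
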
4493 .p2align 4,,15
4494#if defined(OS_LINUX)
4495 .globl kernel_strsm_nt_rl_inv_8x8_lib8
4496 .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
4497kernel_strsm_nt_rl_inv_8x8_lib8:
4498#elif defined(OS_MAC)
4499 .globl _kernel_strsm_nt_rl_inv_8x8_lib8
4500_kernel_strsm_nt_rl_inv_8x8_lib8:
4501#elif defined(OS_WINDOWS)
4502 .globl kernel_strsm_nt_rl_inv_8x8_lib8
4503 .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
4504kernel_strsm_nt_rl_inv_8x8_lib8:
4505#endif
4506
4507 PROLOGUE
4508
4509 // zero accumulation registers
4510
4511 vxorpd %ymm0, %ymm0, %ymm0
4512 vmovapd %ymm0, %ymm1
4513 vmovapd %ymm0, %ymm2
4514 vmovapd %ymm0, %ymm3
4515 vmovapd %ymm0, %ymm4
4516 vmovapd %ymm0, %ymm5
4517 vmovapd %ymm0, %ymm6
4518 vmovapd %ymm0, %ymm7
4519
4520
4521 // call inner dgemm kernel nt
4522
4523 movq ARG1, %r10
4524 movq ARG2, %r11
4525 movq ARG3, %r12
4526
4527#if MACRO_LEVEL>=2
4528 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4529#else
4530#if defined(OS_LINUX) | defined(OS_WINDOWS)
4531 call inner_kernel_gemm_sub_nt_8x8_lib8
4532#elif defined(OS_MAC)
4533 callq _inner_kernel_gemm_sub_nt_8x8_lib8
4534#endif
4535#endif
4536
4537
4538 // call inner blender_loader nn
4539
4540 movq ARG4, %r10
4541
4542#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_8X8_LIB8
4544#else
4545#if defined(OS_LINUX) | defined(OS_WINDOWS)
4546 call inner_blend_scale_11_8x8_lib8
4547#elif defined(OS_MAC)
4548 callq _inner_blend_scale_11_8x8_lib8
4549#endif
4550#endif
4551
4552
4553 // solve
4554
4555 movq ARG6, %r10 // E
4556 movq ARG7, %r11 // inv_diag_E
4557 movl $8, %r12d // n1
4558
4559#if MACRO_LEVEL>=1
4560 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4561#else
4562#if defined(OS_LINUX) | defined(OS_WINDOWS)
4563 call inner_edge_trsm_rlt_inv_8x8_vs_lib8
4564#elif defined(OS_MAC)
4565 callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
4566#endif
4567#endif
4568
4569
4570 // store
4571
4572 movq ARG5, %r10 // D
4573
4574#if MACRO_LEVEL>=1
4575 INNER_STORE_8X8_LIB8
4576#else
4577#if defined(OS_LINUX) | defined(OS_WINDOWS)
4578 call inner_store_8x8_lib8
4579#elif defined(OS_MAC)
4580 callq _inner_store_8x8_lib8
4581#endif
4582#endif
4583
4584
4585 EPILOGUE
4586
4587 ret
4588
4589#if defined(OS_LINUX)
4590 .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
4591#endif
4592
4593
4594
4595
4596
4597// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
4598// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
4599
4600 .p2align 4,,15
4601#if defined(OS_LINUX)
4602 .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
4603 .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
4604kernel_strsm_nt_rl_inv_8x8_vs_lib8:
4605#elif defined(OS_MAC)
4606 .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
4607_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
4608#elif defined(OS_WINDOWS)
4609 .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
4610 .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
4611kernel_strsm_nt_rl_inv_8x8_vs_lib8:
4612#endif
4613
4614 PROLOGUE
4615
4616 // zero accumulation registers
4617
4618 vxorpd %ymm0, %ymm0, %ymm0
4619 vmovapd %ymm0, %ymm1
4620 vmovapd %ymm0, %ymm2
4621 vmovapd %ymm0, %ymm3
4622 vmovapd %ymm0, %ymm4
4623 vmovapd %ymm0, %ymm5
4624 vmovapd %ymm0, %ymm6
4625 vmovapd %ymm0, %ymm7
4626
4627
4628 // call inner dgemm kernel nt
4629
4630 movq ARG1, %r10
4631 movq ARG2, %r11
4632 movq ARG3, %r12
4633
4634#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4636#else
4637#if defined(OS_LINUX) | defined(OS_WINDOWS)
4638 call inner_kernel_gemm_sub_nt_8x8_lib8
4639#elif defined(OS_MAC)
4640 callq _inner_kernel_gemm_sub_nt_8x8_lib8
4641#endif
4642#endif
4643
4644
4645 // call inner blender_loader nn // TODO scale gen
4646
4647 movq ARG4, %r10 // C
4648
4649#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_8X8_LIB8
4651#else
4652#if defined(OS_LINUX) | defined(OS_WINDOWS)
4653 call inner_blend_scale_11_8x8_lib8
4654#elif defined(OS_MAC)
4655 callq _inner_blend_scale_11_8x8_lib8
4656#endif
4657#endif
4658
4659
4660 // solve
4661
4662 movq ARG6, %r10 // E
4663 movq ARG7, %r11 // inv_diag_E
4664 movq ARG9, %r12 // kn
4665
4666#if MACRO_LEVEL>=1
4667	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4668#else
4669#if defined(OS_LINUX) | defined(OS_WINDOWS)
4670 call inner_edge_trsm_rlt_inv_8x8_vs_lib8
4671#elif defined(OS_MAC)
4672 callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
4673#endif
4674#endif
4675
4676
4677 // store
4678
4679 movq ARG5, %r10 // D
4680 movq ARG8, %r11 // m1
4681 movq ARG9, %r12 // n1
4682
4683#if MACRO_LEVEL>=1
4684	INNER_STORE_8X8_VS_LIB8
4685#else
4686#if defined(OS_LINUX) | defined(OS_WINDOWS)
4687 call inner_store_8x8_vs_lib8
4688#elif defined(OS_MAC)
4689 callq _inner_store_8x8_vs_lib8
4690#endif
4691#endif
4692
4693
4694 EPILOGUE
4695
4696 ret
4697
4698#if defined(OS_LINUX)
4699 .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
4700#endif
4701
4702
4703
4704
4705
4706// 1 2 3 4 5 6 7 8 9 10
4707// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
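	// Sketch (assumed from the add/sub gemm calls below): the same triangular solve as
	// kernel_strsm_nt_rl_inv_8x8_lib8, applied to the fused update
	//   D = (C + Ap * Bp^T - Am * Bm^T) * E^{-T},   E lower triangular.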
4708
4709 .p2align 4,,15
4710#if defined(OS_LINUX)
4711 .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
4712 .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
4713kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
4714#elif defined(OS_MAC)
4715 .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
4716_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
4717#elif defined(OS_WINDOWS)
4718 .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
4719 .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
4720kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
4721#endif
4722
4723 PROLOGUE
4724
4725 // zero accumulation registers
4726
4727	vxorps %ymm0, %ymm0, %ymm0
4728 vmovaps %ymm0, %ymm1
4729 vmovaps %ymm0, %ymm2
4730 vmovaps %ymm0, %ymm3
4731 vmovaps %ymm0, %ymm4
4732 vmovaps %ymm0, %ymm5
4733 vmovaps %ymm0, %ymm6
4734 vmovaps %ymm0, %ymm7
4735
4736
4737	// call inner sgemm kernel nt add
4738
4739 movq ARG1, %r10 // kp
4740 movq ARG2, %r11 // Ap
4741 movq ARG3, %r12 // Bp
4742
4743#if MACRO_LEVEL>=2
4744 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4745#else
4746#if defined(OS_LINUX) | defined(OS_WINDOWS)
4747 call inner_kernel_gemm_add_nt_8x8_lib8
4748#elif defined(OS_MAC)
4749 callq _inner_kernel_gemm_add_nt_8x8_lib8
4750#endif
4751#endif
4752
4753
4754	// call inner sgemm kernel nt sub
4755
4756 movq ARG4, %r10 // km
4757 movq ARG5, %r11 // Am
4758 movq ARG6, %r12 // Bm
4759
4760#if MACRO_LEVEL>=2
4761 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4762#else
4763#if defined(OS_LINUX) | defined(OS_WINDOWS)
4764 call inner_kernel_gemm_sub_nt_8x8_lib8
4765#elif defined(OS_MAC)
4766 callq _inner_kernel_gemm_sub_nt_8x8_lib8
4767#endif
4768#endif
4769
4770
4771 // call inner blender_loader nn
4772
4773 movq ARG7, %r10 // C
4774
4775#if MACRO_LEVEL>=1
4776 INNER_BLEND_SCALE_11_8X8_LIB8
4777#else
4778#if defined(OS_LINUX) | defined(OS_WINDOWS)
4779 call inner_blend_scale_11_8x8_lib8
4780#elif defined(OS_MAC)
4781 callq _inner_blend_scale_11_8x8_lib8
4782#endif
4783#endif
4784
4785
4786 // solve
4787
4788 movq ARG9, %r10 // E
4789 movq ARG10, %r11 // inv_diag_E
4790 movq $8, %r12 // n1
4791
4792#if MACRO_LEVEL>=1
4793 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4794#else
4795#if defined(OS_LINUX) | defined(OS_WINDOWS)
4796 call inner_edge_trsm_rlt_inv_8x8_vs_lib8
4797#elif defined(OS_MAC)
4798 callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
4799#endif
4800#endif
4801
4802
4803 // store
4804
4805 movq ARG8, %r10 // D
4806
4807#if MACRO_LEVEL>=1
4808 INNER_STORE_8X8_LIB8
4809#else
4810#if defined(OS_LINUX) | defined(OS_WINDOWS)
4811 call inner_store_8x8_lib8
4812#elif defined(OS_MAC)
4813 callq _inner_store_8x8_lib8
4814#endif
4815#endif
4816
4817
4818 EPILOGUE
4819
4820 ret
4821
4822#if defined(OS_LINUX)
4823 .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
4824#endif
4825
4826
4827
4828
4829
4830// 1 2 3 4 5 6 7 8 9 10 11 12
4831// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
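	// Variable-size variant of the fused kernel above: same gemm+trsm sequence, with the
	// store presumably clamped to km rows and kn columns (the last two arguments).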
4832
4833 .p2align 4,,15
4834#if defined(OS_LINUX)
4835 .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
4836 .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
4837kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
4838#elif defined(OS_MAC)
4839 .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
4840_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
4841#elif defined(OS_WINDOWS)
4842 .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
4843 .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
4844kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
4845#endif
4846
4847 PROLOGUE
4848
4849 // zero accumulation registers
4850
4851	vxorps %ymm0, %ymm0, %ymm0
4852 vmovaps %ymm0, %ymm1
4853 vmovaps %ymm0, %ymm2
4854 vmovaps %ymm0, %ymm3
4855 vmovaps %ymm0, %ymm4
4856 vmovaps %ymm0, %ymm5
4857 vmovaps %ymm0, %ymm6
4858 vmovaps %ymm0, %ymm7
4859
4860
4861	// call inner sgemm kernel nt add
4862
4863 movq ARG1, %r10 // kp
4864 movq ARG2, %r11 // Ap
4865 movq ARG3, %r12 // Bp
4866
4867#if MACRO_LEVEL>=2
4868 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4869#else
4870#if defined(OS_LINUX) | defined(OS_WINDOWS)
4871 call inner_kernel_gemm_add_nt_8x8_lib8
4872#elif defined(OS_MAC)
4873 callq _inner_kernel_gemm_add_nt_8x8_lib8
4874#endif
4875#endif
4876
4877
4878	// call inner sgemm kernel nt sub
4879
4880 movq ARG4, %r10 // km
4881 movq ARG5, %r11 // Am
4882 movq ARG6, %r12 // Bm
4883
4884#if MACRO_LEVEL>=2
4885 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4886#else
4887#if defined(OS_LINUX) | defined(OS_WINDOWS)
4888 call inner_kernel_gemm_sub_nt_8x8_lib8
4889#elif defined(OS_MAC)
4890 callq _inner_kernel_gemm_sub_nt_8x8_lib8
4891#endif
4892#endif
4893
4894
4895 // call inner blender_loader nn
4896
4897 movq ARG7, %r10 // C
4898
4899#if MACRO_LEVEL>=1
4900 INNER_BLEND_SCALE_11_8X8_LIB8
4901#else
4902#if defined(OS_LINUX) | defined(OS_WINDOWS)
4903 call inner_blend_scale_11_8x8_lib8
4904#elif defined(OS_MAC)
4905 callq _inner_blend_scale_11_8x8_lib8
4906#endif
4907#endif
4908
4909
4910 // solve
4911
4912 movq ARG9, %r10 // E
4913 movq ARG10, %r11 // inv_diag_E
4914 movq ARG12, %r12 // kn
4915
4916#if MACRO_LEVEL>=1
4917 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4918#else
4919#if defined(OS_LINUX) | defined(OS_WINDOWS)
4920 call inner_edge_trsm_rlt_inv_8x8_vs_lib8
4921#elif defined(OS_MAC)
4922 callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
4923#endif
4924#endif
4925
4926
4927 // store
4928
4929 movq ARG8, %r10 // D
4930 movq ARG11, %r11 // km
4931 movq ARG12, %r12 // kn
4932
4933#if MACRO_LEVEL>=1
4934 INNER_STORE_8X8_VS_LIB8
4935#else
4936#if defined(OS_LINUX) | defined(OS_WINDOWS)
4937 call inner_store_8x8_vs_lib8
4938#elif defined(OS_MAC)
4939 callq _inner_store_8x8_vs_lib8
4940#endif
4941#endif
4942
4943
4944 EPILOGUE
4945
4946 ret
4947
4948#if defined(OS_LINUX)
4949 .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
4950#endif
4951
4952
4953
4954
4955
4956// edi rsi rdx rcx r8 r9
4957// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
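	// Sketch of the operation (assumed from the gemm_sub / potrf edge calls below):
	//   D * D^T = C - A * B^T,   D lower triangular (Cholesky factor),
	//   with inv_diag_D[jj] receiving 1.0f/D[jj][jj].
	// Scalar C reference of the factorization, ignoring the lib8 panel layout and any
	// zero/negative-pivot handling (T denotes the 8x8 temporary after the gemm update):
	//   for(jj=0; jj<8; jj++) {
	//     float t = T[jj][jj];
	//     for(kk=0; kk<jj; kk++) t -= D[jj][kk] * D[jj][kk];
	//     D[jj][jj] = sqrtf(t);
	//     inv_diag_D[jj] = 1.0f/D[jj][jj];
	//     for(ii=jj+1; ii<8; ii++) {
	//       t = T[ii][jj];
	//       for(kk=0; kk<jj; kk++) t -= D[ii][kk] * D[jj][kk];
	//       D[ii][jj] = t * inv_diag_D[jj];
	//     }
	//   }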
4958
4959 .p2align 4,,15
4960#if defined(OS_LINUX)
4961 .globl kernel_spotrf_nt_l_8x8_lib8
4962 .type kernel_spotrf_nt_l_8x8_lib8, @function
4963kernel_spotrf_nt_l_8x8_lib8:
4964#elif defined(OS_MAC)
4965 .globl _kernel_spotrf_nt_l_8x8_lib8
4966_kernel_spotrf_nt_l_8x8_lib8:
4967#elif defined(OS_WINDOWS)
4968 .globl kernel_spotrf_nt_l_8x8_lib8
4969 .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
4970kernel_spotrf_nt_l_8x8_lib8:
4971#endif
4972
4973 PROLOGUE
4974
4975 // zero accumulation registers
4976
4977 vxorpd %ymm0, %ymm0, %ymm0
4978 vmovapd %ymm0, %ymm1
4979 vmovapd %ymm0, %ymm2
4980 vmovapd %ymm0, %ymm3
4981 vmovapd %ymm0, %ymm4
4982 vmovapd %ymm0, %ymm5
4983 vmovapd %ymm0, %ymm6
4984 vmovapd %ymm0, %ymm7
4985
4986
4987	// call inner sgemm kernel nt
4988
4989 movq ARG1, %r10
4990 movq ARG2, %r11
4991 movq ARG3, %r12
4992
4993#if MACRO_LEVEL>=2
4994 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4995#else
4996#if defined(OS_LINUX) | defined(OS_WINDOWS)
4997 call inner_kernel_gemm_sub_nt_8x8_lib8
4998#elif defined(OS_MAC)
4999 callq _inner_kernel_gemm_sub_nt_8x8_lib8
5000#endif
5001#endif
5002
5003
5004 // call inner blender_loader nn
5005
5006 movq ARG4, %r10 // C
5007
5008#if MACRO_LEVEL>=1
5009 INNER_BLEND_SCALE_11_8X8_LIB8
5010#else
5011#if defined(OS_LINUX) | defined(OS_WINDOWS)
5012 call inner_blend_scale_11_8x8_lib8
5013#elif defined(OS_MAC)
5014 callq _inner_blend_scale_11_8x8_lib8
5015#endif
5016#endif
5017
5018
5019 // factorization
5020
5021 movq ARG6, %r10 // inv_diag_D
5022 movl $8, %r11d // n1
5023
5024#if MACRO_LEVEL>=1
5025 INNER_EDGE_POTRF_8X8_VS_LIB8
5026#else
5027#if defined(OS_LINUX) | defined(OS_WINDOWS)
5028 call inner_edge_potrf_8x8_vs_lib8
5029#elif defined(OS_MAC)
5030 callq _inner_edge_potrf_8x8_vs_lib8
5031#endif
5032#endif
5033
5034
5035 // store
5036
5037 movq ARG5, %r10 // D
5038
5039#if MACRO_LEVEL>=1
5040 INNER_STORE_L_8X8_LIB8
5041#else
5042#if defined(OS_LINUX) | defined(OS_WINDOWS)
5043 call inner_store_l_8x8_lib8
5044#elif defined(OS_MAC)
5045 callq _inner_store_l_8x8_lib8
5046#endif
5047#endif
5048
5049
5050 EPILOGUE
5051
5052 ret
5053
5054#if defined(OS_LINUX)
5055 .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
5056#endif
5057
5058
5059
5060
5061
5062// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
5063// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
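	// Variable-size variant of kernel_spotrf_nt_l_8x8_lib8: same factorization, with the
	// edge and the store presumably clamped to km rows and kn columns (the last two arguments).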
5064
5065 .p2align 4,,15
5066#if defined(OS_LINUX)
5067 .globl kernel_spotrf_nt_l_8x8_vs_lib8
5068 .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
5069kernel_spotrf_nt_l_8x8_vs_lib8:
5070#elif defined(OS_MAC)
5071 .globl _kernel_spotrf_nt_l_8x8_vs_lib8
5072_kernel_spotrf_nt_l_8x8_vs_lib8:
5073#elif defined(OS_WINDOWS)
5074 .globl kernel_spotrf_nt_l_8x8_vs_lib8
5075 .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
5076kernel_spotrf_nt_l_8x8_vs_lib8:
5077#endif
5078
5079 PROLOGUE
5080
5081 // zero accumulation registers
5082
5083 vxorpd %ymm0, %ymm0, %ymm0
5084 vmovapd %ymm0, %ymm1
5085 vmovapd %ymm0, %ymm2
5086 vmovapd %ymm0, %ymm3
5087 vmovapd %ymm0, %ymm4
5088 vmovapd %ymm0, %ymm5
5089 vmovapd %ymm0, %ymm6
5090 vmovapd %ymm0, %ymm7
5091
5092
5093	// call inner sgemm kernel nt
5094
5095 movq ARG1, %r10
5096 movq ARG2, %r11
5097 movq ARG3, %r12
5098
5099#if MACRO_LEVEL>=2
5100	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
5101#else
5102#if defined(OS_LINUX) | defined(OS_WINDOWS)
5103 call inner_kernel_gemm_sub_nt_8x8_lib8
5104#elif defined(OS_MAC)
5105 callq _inner_kernel_gemm_sub_nt_8x8_lib8
5106#endif
5107#endif
5108
5109
5110 // call inner blender_loader nn
5111
5112 movq ARG4, %r10 // C
5113
5114#if MACRO_LEVEL>=1
5115	INNER_BLEND_SCALE_11_8X8_LIB8
5116#else
5117#if defined(OS_LINUX) | defined(OS_WINDOWS)
5118 call inner_blend_scale_11_8x8_lib8
5119#elif defined(OS_MAC)
5120 callq _inner_blend_scale_11_8x8_lib8
5121#endif
5122#endif
5123
5124
5125 // factorization
5126
5127 movq ARG6, %r10 // inv_diag_D
5128 movq ARG8, %r11 // kn
5129
5130#if MACRO_LEVEL>=1
5131	INNER_EDGE_POTRF_8X8_VS_LIB8
5132#else
5133#if defined(OS_LINUX) | defined(OS_WINDOWS)
5134 call inner_edge_potrf_8x8_vs_lib8
5135#elif defined(OS_MAC)
5136 callq _inner_edge_potrf_8x8_vs_lib8
5137#endif
5138#endif
5139
5140
5141 // store
5142
5143 movq ARG5, %r10 // D
5144 movq ARG7, %r11 // m1
5145 movq ARG8, %r12 // n1
5146
5147#if MACRO_LEVEL>=1
5148	INNER_STORE_L_8X8_VS_LIB8
5149#else
5150#if defined(OS_LINUX) | defined(OS_WINDOWS)
5151 call inner_store_l_8x8_vs_lib8
5152#elif defined(OS_MAC)
5153 callq _inner_store_l_8x8_vs_lib8
5154#endif
5155#endif
5156
5157
5158 EPILOGUE
5159
5160 ret
5161
5162#if defined(OS_LINUX)
5163 .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
5164#endif
5165
5166
5167
5168
5169
5170// 1 2 3 4 5 6 7 8 9
5171// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
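	// Sketch (assumed from the add/sub gemm calls below): the same factorization as
	// kernel_spotrf_nt_l_8x8_lib8, applied to the fused update
	//   D * D^T = C + Ap * Bp^T - Am * Bm^T,   D lower triangular.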
5172
5173 .p2align 4,,15
5174#if defined(OS_LINUX)
5175 .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
5176 .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
5177kernel_ssyrk_spotrf_nt_l_8x8_lib8:
5178#elif defined(OS_MAC)
5179 .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
5180_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
5181#elif defined(OS_WINDOWS)
5182 .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
5183 .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
5184kernel_ssyrk_spotrf_nt_l_8x8_lib8:
5185#endif
5186
5187 PROLOGUE
5188
5189 // zero accumulation registers
5190
5191 vxorps %ymm0, %ymm0, %ymm0
5192 vmovaps %ymm0, %ymm1
5193 vmovaps %ymm0, %ymm2
5194 vmovaps %ymm0, %ymm3
5195 vmovaps %ymm0, %ymm4
5196 vmovaps %ymm0, %ymm5
5197 vmovaps %ymm0, %ymm6
5198 vmovaps %ymm0, %ymm7
5199
5200
5201	// call inner sgemm kernel nt add
5202
5203 movq ARG1, %r10 // kp
5204 movq ARG2, %r11 // Ap
5205 movq ARG3, %r12 // Bp
5206
5207#if MACRO_LEVEL>=2
5208 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
5209#else
5210#if defined(OS_LINUX) | defined(OS_WINDOWS)
5211 call inner_kernel_gemm_add_nt_8x8_lib8
5212#elif defined(OS_MAC)
5213 callq _inner_kernel_gemm_add_nt_8x8_lib8
5214#endif
5215#endif
5216
5217
5218	// call inner sgemm kernel nt sub
5219
5220 movq ARG4, %r10 // km
5221 movq ARG5, %r11 // Am
5222 movq ARG6, %r12 // Bm
5223
5224#if MACRO_LEVEL>=2
5225 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
5226#else
5227#if defined(OS_LINUX) | defined(OS_WINDOWS)
5228 call inner_kernel_gemm_sub_nt_8x8_lib8
5229#elif defined(OS_MAC)
5230 callq _inner_kernel_gemm_sub_nt_8x8_lib8
5231#endif
5232#endif
5233
5234
5235 // call inner blender_loader nn
5236
5237 movq ARG7, %r10 // C
5238
5239#if MACRO_LEVEL>=1
5240 INNER_BLEND_SCALE_11_8X8_LIB8
5241#else
5242#if defined(OS_LINUX) | defined(OS_WINDOWS)
5243 call inner_blend_scale_11_8x8_lib8
5244#elif defined(OS_MAC)
5245 callq _inner_blend_scale_11_8x8_lib8
5246#endif
5247#endif
5248
5249
5250 // factorization
5251
5252 movq ARG9, %r10 // inv_diag_D
5253 movl $8, %r11d
5254
5255#if MACRO_LEVEL>=1
5256 INNER_EDGE_POTRF_8X8_VS_LIB8
5257#else
5258#if defined(OS_LINUX) | defined(OS_WINDOWS)
5259 call inner_edge_potrf_8x8_vs_lib8
5260#elif defined(OS_MAC)
5261 callq _inner_edge_potrf_8x8_vs_lib8
5262#endif
5263#endif
5264
5265
5266 // store
5267
5268 movq ARG8, %r10 // D
5269
5270#if MACRO_LEVEL>=1
5271 INNER_STORE_L_8X8_LIB8
5272#else
5273#if defined(OS_LINUX) | defined(OS_WINDOWS)
5274 call inner_store_l_8x8_lib8
5275#elif defined(OS_MAC)
5276 callq _inner_store_l_8x8_lib8
5277#endif
5278#endif
5279
5280
5281 EPILOGUE
5282
5283 ret
5284
5285#if defined(OS_LINUX)
5286 .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
5287#endif
5288
5289
5290
5291
5292
5293// 1 2 3 4 5 6 7 8 9 10 11
5294// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
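	// Variable-size variant of the fused kernel above: same syrk+potrf sequence, with the
	// store presumably clamped to km rows and kn columns (the last two arguments).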
5295
5296 .p2align 4,,15
5297#if defined(OS_LINUX)
5298 .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
5299 .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
5300kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
5301#elif defined(OS_MAC)
5302 .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
5303_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
5304#elif defined(OS_WINDOWS)
5305 .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
5306 .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
5307kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
5308#endif
5309
5310 PROLOGUE
5311
5312 // zero accumulation registers
5313
5314	vxorps %ymm0, %ymm0, %ymm0
5315 vmovaps %ymm0, %ymm1
5316 vmovaps %ymm0, %ymm2
5317 vmovaps %ymm0, %ymm3
5318 vmovaps %ymm0, %ymm4
5319 vmovaps %ymm0, %ymm5
5320 vmovaps %ymm0, %ymm6
5321 vmovaps %ymm0, %ymm7
5322
5323
5324	// call inner sgemm kernel nt add
5325
5326 movq ARG1, %r10 // kp
5327 movq ARG2, %r11 // Ap
5328 movq ARG3, %r12 // Bp
5329
5330#if MACRO_LEVEL>=2
5331 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
5332#else
5333#if defined(OS_LINUX) | defined(OS_WINDOWS)
5334 call inner_kernel_gemm_add_nt_8x8_lib8
5335#elif defined(OS_MAC)
5336 callq _inner_kernel_gemm_add_nt_8x8_lib8
5337#endif
5338#endif
5339
5340
5341	// call inner sgemm kernel nt sub
5342
5343 movq ARG4, %r10 // km
5344 movq ARG5, %r11 // Am
5345 movq ARG6, %r12 // Bm
5346
5347#if MACRO_LEVEL>=2
5348 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
5349#else
5350#if defined(OS_LINUX) | defined(OS_WINDOWS)
5351 call inner_kernel_gemm_sub_nt_8x8_lib8
5352#elif defined(OS_MAC)
5353 callq _inner_kernel_gemm_sub_nt_8x8_lib8
5354#endif
5355#endif
5356
5357
5358 // call inner blender_loader nn
5359
5360 movq ARG7, %r10 // C
5361
5362#if MACRO_LEVEL>=1
5363 INNER_BLEND_SCALE_11_8X8_LIB8
5364#else
5365#if defined(OS_LINUX) | defined(OS_WINDOWS)
5366 call inner_blend_scale_11_8x8_lib8
5367#elif defined(OS_MAC)
5368 callq _inner_blend_scale_11_8x8_lib8
5369#endif
5370#endif
5371
5372
5373 // factorization
5374
5375 movq ARG9, %r10 // inv_diag_D
5376 movq ARG11, %r11 // kn
5377
5378#if MACRO_LEVEL>=1
5379 INNER_EDGE_POTRF_8X8_VS_LIB8
5380#else
5381#if defined(OS_LINUX) | defined(OS_WINDOWS)
5382 call inner_edge_potrf_8x8_vs_lib8
5383#elif defined(OS_MAC)
5384 callq _inner_edge_potrf_8x8_vs_lib8
5385#endif
5386#endif
5387
5388
5389 // store
5390
5391 movq ARG8, %r10 // D
5392 movq ARG10, %r11 // km
5393 movq ARG11, %r12 // kn
5394
5395#if MACRO_LEVEL>=1
5396 INNER_STORE_L_8X8_VS_LIB8
5397#else
5398#if defined(OS_LINUX) | defined(OS_WINDOWS)
5399 call inner_store_l_8x8_vs_lib8
5400#elif defined(OS_MAC)
5401 callq _inner_store_l_8x8_vs_lib8
5402#endif
5403#endif
5404
5405
5406 EPILOGUE
5407
5408 ret
5409
5410#if defined(OS_LINUX)
5411 .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
5412#endif
5413
5414
5415
5416
5417
5418 // read-only data
5419#if defined(OS_LINUX)
5420 .section .rodata.cst32,"aM",@progbits,32
5421#elif defined(OS_MAC)
5422 .section __TEXT,__const
5423#elif defined(OS_WINDOWS)
5424 .section .rdata,"dr"
5425#endif
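	// The .long values below are single-precision IEEE-754 bit patterns, e.g.
	// 1056964608 = 0x3F000000 = 0.5f and 1065353216 = 0x3F800000 = 1.0f.
	// Hedged guess at their use: LC00..LC02 look like per-lane index offsets
	// (0.5, 1.5, ..., 23.5) for building masks in the variable-size kernels,
	// LC03 is a vector of ones, and LC09 a vector with its two highest lanes negated.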
5426
5427#if defined(OS_LINUX) | defined(OS_WINDOWS)
5428 .align 32
5429.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
5430#elif defined(OS_MAC)
5431 .align 5
5432LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
5433#endif
5434 .long 1056964608
5435 .long 1069547520
5436 .long 1075838976
5437 .long 1080033280
5438 .long 1083179008
5439 .long 1085276160
5440 .long 1087373312
5441 .long 1089470464
5442
5443#if defined(OS_LINUX) | defined(OS_WINDOWS)
5444 .align 32
5445.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
5446#elif defined(OS_MAC)
5447 .align 5
5448LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
5449#endif
5450 .long 1091043328
5451 .long 1092091904
5452 .long 1093140480
5453 .long 1094189056
5454 .long 1095237632
5455 .long 1096286208
5456 .long 1097334784
5457 .long 1098383360
5458
5459#if defined(OS_LINUX) | defined(OS_WINDOWS)
5460 .align 32
5461.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
5462#elif defined(OS_MAC)
5463 .align 5
5464LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
5465#endif
5466 .long 1099169792
5467 .long 1099694080
5468 .long 1100218368
5469 .long 1100742656
5470 .long 1101266944
5471 .long 1101791232
5472 .long 1102315520
5473 .long 1102839808
5474
5475#if defined(OS_LINUX) | defined(OS_WINDOWS)
5476 .align 32
5477.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
5478#elif defined(OS_MAC)
5479 .align 5
5480LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
5481#endif
5482 .long 1065353216
5483 .long 1065353216
5484 .long 1065353216
5485 .long 1065353216
5486 .long 1065353216
5487 .long 1065353216
5488 .long 1065353216
5489 .long 1065353216
5490
5491#if defined(OS_LINUX) | defined(OS_WINDOWS)
5492 .align 32
5493.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
5494#elif defined(OS_MAC)
5495 .align 5
5496LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
5497#endif
5498 .long 1065353216
5499 .long 1065353216
5500 .long 1065353216
5501 .long 1065353216
5502 .long 1065353216
5503 .long 1065353216
5504 .long 3212836864
5505 .long 3212836864
5506
5507
5508
5509#if defined(OS_LINUX)
5510 .section .note.GNU-stack,"",@progbits
5511#elif defined(OS_MAC)
5512 .subsections_via_symbols
5513#endif
5514