/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1 %rdi
#define ARG2 %rsi
#define ARG3 %rdx
#define ARG4 %rcx
#define ARG5 %r8
#define ARG6 %r9
#define ARG7 STACKSIZE + 8(%rsp)
#define ARG8 STACKSIZE + 16(%rsp)
#define ARG9 STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
    subq $STACKSIZE, %rsp; \
    movq %rbx, (%rsp); \
    movq %rbp, 8(%rsp); \
    movq %r12, 16(%rsp); \
    movq %r13, 24(%rsp); \
    movq %r14, 32(%rsp); \
    movq %r15, 40(%rsp); \
    vzeroupper;
#define EPILOGUE \
    vzeroupper; \
    movq (%rsp), %rbx; \
    movq 8(%rsp), %rbp; \
    movq 16(%rsp), %r12; \
    movq 24(%rsp), %r13; \
    movq 32(%rsp), %r14; \
    movq 40(%rsp), %r15; \
    addq $STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1 %rcx
#define ARG2 %rdx
#define ARG3 %r8
#define ARG4 %r9
#define ARG5 STACKSIZE + 40(%rsp)
#define ARG6 STACKSIZE + 48(%rsp)
#define ARG7 STACKSIZE + 56(%rsp)
#define ARG8 STACKSIZE + 64(%rsp)
#define ARG9 STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
    subq $STACKSIZE, %rsp; \
    movq %rbx, (%rsp); \
    movq %rbp, 8(%rsp); \
    movq %r12, 16(%rsp); \
    movq %r13, 24(%rsp); \
    movq %r14, 32(%rsp); \
    movq %r15, 40(%rsp); \
    movq %rdi, 48(%rsp); \
    movq %rsi, 56(%rsp); \
    vmovups %xmm6, 64(%rsp); \
    vmovups %xmm7, 80(%rsp); \
    vmovups %xmm8, 96(%rsp); \
    vmovups %xmm9, 112(%rsp); \
    vmovups %xmm10, 128(%rsp); \
    vmovups %xmm11, 144(%rsp); \
    vmovups %xmm12, 160(%rsp); \
    vmovups %xmm13, 176(%rsp); \
    vmovups %xmm14, 192(%rsp); \
    vmovups %xmm15, 208(%rsp); \
    vzeroupper;
#define EPILOGUE \
    vzeroupper; \
    movq (%rsp), %rbx; \
    movq 8(%rsp), %rbp; \
    movq 16(%rsp), %r12; \
    movq 24(%rsp), %r13; \
    movq 32(%rsp), %r14; \
    movq 40(%rsp), %r15; \
    movq 48(%rsp), %rdi; \
    movq 56(%rsp), %rsi; \
    vmovups 64(%rsp), %xmm6; \
    vmovups 80(%rsp), %xmm7; \
    vmovups 96(%rsp), %xmm8; \
    vmovups 112(%rsp), %xmm9; \
    vmovups 128(%rsp), %xmm10; \
    vmovups 144(%rsp), %xmm11; \
    vmovups 160(%rsp), %xmm12; \
    vmovups 176(%rsp), %xmm13; \
    vmovups 192(%rsp), %xmm14; \
    vmovups 208(%rsp), %xmm15; \
    addq $STACKSIZE, %rsp;

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
    .text
#elif defined(OS_MAC)
    .section __TEXT,__text,regular,pure_instructions
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+8*k*sizeof(float)
// r12 <- x+k*sizeof(float)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
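//
// For reference, a minimal C sketch of what this routine computes (my
// illustration, not code from BLASFEO): it accumulates z += A*x over k columns
// of one 8-row panel of A, stored with a panel height (bs) of 8 floats:
//
//    // float A[8*k] panel-major, float x[k], float z[8]
//    for(jj=0; jj<k; jj++)
//        for(ii=0; ii<8; ii++)
//            z[ii] += A[ii+8*jj] * x[jj];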

#if MACRO_LEVEL>=2
    .macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_kernel_gemv_add_n_8_lib8, @function
inner_kernel_gemv_add_n_8_lib8:
#elif defined(OS_MAC)
_inner_kernel_gemv_add_n_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
inner_kernel_gemv_add_n_8_lib8:
#endif
#endif

    cmpl $0, %r10d
    jle 2f // return

    cmpl $4, %r10d
    jl 0f // clean-up loop

    // main loop
    .p2align 3
1: // main loop

    vmovaps 0(%r11), %ymm8
    vbroadcastss 0(%r12), %ymm12
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm0, %ymm15, %ymm0

    subl $4, %r10d

    vmovaps 32(%r11), %ymm8
    vbroadcastss 4(%r12), %ymm12
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm1, %ymm15, %ymm1

    vmovaps 64(%r11), %ymm8
    vbroadcastss 8(%r12), %ymm12
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm2, %ymm15, %ymm2

    vmovaps 96(%r11), %ymm8
    vbroadcastss 12(%r12), %ymm12
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm3, %ymm15, %ymm3

    addq $128, %r11
    addq $16, %r12

    cmpl $3, %r10d

    jg 1b // main loop


    // consider clean-up
    cmpl $0, %r10d
    jle 2f // return

0: // clean-up

    vmovaps 0(%r11), %ymm8
    vbroadcastss 0(%r12), %ymm12
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm0, %ymm15, %ymm0

    addq $32, %r11
    addq $4, %r12

    subl $1, %r10d
    cmpl $0, %r10d

    jg 0b // clean

2: // return

#if MACRO_LEVEL>=2
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- bs*sda*sizeof(float) = 32*sda
// r13 <- x
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+k*sda*sizeof(float)
// r12 <- bs*sda*sizeof(float) = 32*sda
// r13 <- x+k*sizeof(float)
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
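//
// For reference, a minimal C sketch of what this routine computes (my
// illustration, not code from BLASFEO): z += A^T*x on an 8-column slab of A,
// walking k rows across row panels of height 8 with panel stride sda:
//
//    // float x[k], float z[8]; A stored in 8-row panels
//    for(jj=0; jj<k; jj++)
//        for(ii=0; ii<8; ii++)
//            z[ii] += A[(jj/8)*8*sda + jj%8 + 8*ii] * x[jj];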

#if MACRO_LEVEL>=2
    .macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_kernel_gemv_add_t_8_lib8, @function
inner_kernel_gemv_add_t_8_lib8:
#elif defined(OS_MAC)
_inner_kernel_gemv_add_t_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
inner_kernel_gemv_add_t_8_lib8:
#endif
#endif

    cmpl $0, %r10d
    jle 2f // return

    cmpl $8, %r10d
    jl 0f // clean-up loop

    // main loop
    .p2align 3
1: // main loop

    vmovups 0(%r13), %ymm12

    vmovaps 0(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm0, %ymm15, %ymm0

    subl $8, %r10d

    vmovaps 32(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm1, %ymm15, %ymm1

    vmovaps 64(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm2, %ymm15, %ymm2

    vmovaps 96(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm3, %ymm15, %ymm3

    vmovaps 128(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm4, %ymm15, %ymm4

    vmovaps 160(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm5, %ymm15, %ymm5

    vmovaps 192(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm6, %ymm15, %ymm6

    vmovaps 224(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm7, %ymm15, %ymm7

    addq %r12, %r11
    addq $32, %r13

    cmpl $7, %r10d

    jg 1b // main loop


    // consider clean-up
    cmpl $0, %r10d
    jle 2f // return

0: // clean-up

    vcvtsi2ss %r10d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    vmovups .LC00(%rip), %ymm13
#elif defined(OS_MAC)
    vmovups LC00(%rip), %ymm13
#endif
    vshufps $0x00, %xmm14, %xmm14, %xmm14
    vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
    vsubps %ymm14, %ymm13, %ymm14

    vmaskmovps 0(%r13), %ymm14, %ymm12

    vmaskmovps 0(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm0, %ymm15, %ymm0

    vmaskmovps 32(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm1, %ymm15, %ymm1

    vmaskmovps 64(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm2, %ymm15, %ymm2

    vmaskmovps 96(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm3, %ymm15, %ymm3

    vmaskmovps 128(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm4, %ymm15, %ymm4

    vmaskmovps 160(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm5, %ymm15, %ymm5

    vmaskmovps 192(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm6, %ymm15, %ymm6

    vmaskmovps 224(%r11), %ymm14, %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm7, %ymm15, %ymm7

    sall $2, %r10d
    addq %r10, %r11
    addq %r10, %r13
    xorl %r10d, %r10d


2: // return

#if MACRO_LEVEL>=2
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- bs*sda*sizeof(float) = 32*sda
// r13 <- x
// r14d <- offA
// r15 <- dirty
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- kmax-(8-offA)
// r11 <-
// r12 <- bs*sda*sizeof(float) = 32*sda
// r13 <-
// r14d <- offA
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
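//
// A sketch of the masking logic (my illustration; it assumes .LC00, defined in
// the full file, holds ascending per-lane constants used as a lane index): A
// and x are first moved back by offA elements, then a lane mask keeps only the
// valid entries of the unaligned leading panel:
//
//    // mask[ii] = (ii >= offA) && (ii < offA+kmax), for lanes ii = 0..7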

#if MACRO_LEVEL>=2
    .macro INNER_EDGE_GEMV_ADD_T_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_edge_gemv_add_t_8_lib8, @function
inner_edge_gemv_add_t_8_lib8:
#elif defined(OS_MAC)
_inner_edge_gemv_add_t_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
inner_edge_gemv_add_t_8_lib8:
#endif
#endif

    cmpl $0, %r14d
    jle 0f // return

    movl %r14d, %r15d
    sall $2, %r15d // offA*sizeof(float)

    subq %r15, %r11 // A - offA
    subq %r15, %r13 // x - offA

    movl %r10d, %r15d // kmax
    addl %r14d, %r15d // kmax + offA

    vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
    vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    vmovups .LC00(%rip), %ymm13
#elif defined(OS_MAC)
    vmovups LC00(%rip), %ymm13
#endif
    vshufps $0x00, %xmm14, %xmm14, %xmm14
    vshufps $0x00, %xmm15, %xmm15, %xmm15
    vinsertf128 $1, %xmm14, %ymm14, %ymm14
    vinsertf128 $1, %xmm15, %ymm15, %ymm15
    vsubps %ymm13, %ymm14, %ymm14
    vsubps %ymm15, %ymm13, %ymm15
    vandps %ymm15, %ymm14, %ymm14

    vmaskmovps 0(%r13), %ymm14, %ymm12

    vmovaps 0(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm0, %ymm15, %ymm0

    vmovaps 32(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm1, %ymm15, %ymm1

    vmovaps 64(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm2, %ymm15, %ymm2

    vmovaps 96(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm3, %ymm15, %ymm3

    vmovaps 128(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm4, %ymm15, %ymm4

    vmovaps 160(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm5, %ymm15, %ymm5

    vmovaps 192(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm6, %ymm15, %ymm6

    vmovaps 224(%r11), %ymm8
    vmulps %ymm8, %ymm12, %ymm15
    vaddps %ymm7, %ymm15, %ymm7

    addq $32, %r13 // x + 8
    addq %r12, %r11 // A + bs*sda

    addl %r14d, %r10d
    subl $8, %r10d // kmax - (8-offA)

0: // return

#if MACRO_LEVEL>=2
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
#endif
#endif




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
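//
// A minimal C sketch of the operation (my illustration, not code from
// BLASFEO): a forward substitution z = E^{-1}*z on the 8x8 lower-triangular
// panel E, with the reciprocal diagonal passed in inv_diag_E so that no
// division is needed:
//
//    for(jj=0; jj<8; jj++)
//        {
//        z[jj] *= inv_diag_E[jj];
//        for(ii=jj+1; ii<8; ii++)
//            z[ii] -= E[ii+8*jj] * z[jj];
//        }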

#if MACRO_LEVEL>=1
    .macro INNER_EDGE_TRSV_LN_INV_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_edge_trsv_ln_inv_8_lib8, @function
inner_edge_trsv_ln_inv_8_lib8:
#elif defined(OS_MAC)
_inner_edge_trsv_ln_inv_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
inner_edge_trsv_ln_inv_8_lib8:
#endif
#endif

    vxorps %ymm14, %ymm14, %ymm14

    vbroadcastss 0(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x01, %ymm1, %ymm0, %ymm0

    vmovaps 0(%r10), %ymm13
    vblendps $0x01, %ymm14, %ymm13, %ymm13
    vpermilps $0x00, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 4(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x02, %ymm1, %ymm0, %ymm0

    vmovaps 32(%r10), %ymm13
    vblendps $0x03, %ymm14, %ymm13, %ymm13
    vpermilps $0x55, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 8(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x04, %ymm1, %ymm0, %ymm0

    vmovaps 64(%r10), %ymm13
    vblendps $0x07, %ymm14, %ymm13, %ymm13
    vpermilps $0xaa, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 12(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x08, %ymm1, %ymm0, %ymm0

    vmovaps 96(%r10), %ymm13
    vblendps $0x0f, %ymm14, %ymm13, %ymm13
    vpermilps $0xff, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 16(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x10, %ymm1, %ymm0, %ymm0

    vmovaps 128(%r10), %ymm13
    vblendps $0x1f, %ymm14, %ymm13, %ymm13
    vpermilps $0x00, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 20(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x20, %ymm1, %ymm0, %ymm0

    vmovaps 160(%r10), %ymm13
    vblendps $0x3f, %ymm14, %ymm13, %ymm13
    vpermilps $0x55, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 24(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x40, %ymm1, %ymm0, %ymm0

    vmovaps 192(%r10), %ymm13
    vblendps $0x7f, %ymm14, %ymm13, %ymm13
    vpermilps $0xaa, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0
    vbroadcastss 28(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x80, %ymm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
#endif
#endif




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12d <- kn
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12d <- kn
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
    .macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_edge_trsv_ln_inv_8_vs_lib8, @function
inner_edge_trsv_ln_inv_8_vs_lib8:
#elif defined(OS_MAC)
_inner_edge_trsv_ln_inv_8_vs_lib8:
#elif defined(OS_WINDOWS)
    .def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
inner_edge_trsv_ln_inv_8_vs_lib8:
#endif
#endif

    vxorps %ymm14, %ymm14, %ymm14

    vbroadcastss 0(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x01, %ymm1, %ymm0, %ymm0
    vmovaps 0(%r10), %ymm13
    vblendps $0x01, %ymm14, %ymm13, %ymm13
    vpermilps $0x00, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $2, %r12d
    jl 0f // ret

    vbroadcastss 4(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x02, %ymm1, %ymm0, %ymm0
    vmovaps 32(%r10), %ymm13
    vblendps $0x03, %ymm14, %ymm13, %ymm13
    vpermilps $0x55, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $3, %r12d
    jl 0f // ret

    vbroadcastss 8(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x04, %ymm1, %ymm0, %ymm0
    vmovaps 64(%r10), %ymm13
    vblendps $0x07, %ymm14, %ymm13, %ymm13
    vpermilps $0xaa, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $4, %r12d
    jl 0f // ret

    vbroadcastss 12(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x08, %ymm1, %ymm0, %ymm0
    vmovaps 96(%r10), %ymm13
    vblendps $0x0f, %ymm14, %ymm13, %ymm13
    vpermilps $0xff, %ymm0, %ymm12
    vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $5, %r12d
    jl 0f // ret

    vbroadcastss 16(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x10, %ymm1, %ymm0, %ymm0
    vmovaps 128(%r10), %ymm13
    vblendps $0x1f, %ymm14, %ymm13, %ymm13
    vpermilps $0x00, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $6, %r12d
    jl 0f // ret

    vbroadcastss 20(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x20, %ymm1, %ymm0, %ymm0
    vmovaps 160(%r10), %ymm13
    vblendps $0x3f, %ymm14, %ymm13, %ymm13
    vpermilps $0x55, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $7, %r12d
    jl 0f // ret

    vbroadcastss 24(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x40, %ymm1, %ymm0, %ymm0
    vmovaps 192(%r10), %ymm13
    vblendps $0x7f, %ymm14, %ymm13, %ymm13
    vpermilps $0xaa, %ymm0, %ymm12
    vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
    vmulps %ymm13, %ymm12, %ymm15
    vsubps %ymm15, %ymm0, %ymm0

    cmpl $8, %r12d
    jl 0f // ret

    vbroadcastss 28(%r11), %ymm12
    vmulps %ymm0, %ymm12, %ymm1
    vblendps $0x80, %ymm1, %ymm0, %ymm0

0:

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
#endif
#endif




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
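//
// A minimal C sketch of the operation (my illustration, not code from
// BLASFEO): a backward substitution z = E^{-T}*z with the same 8x8
// lower-triangular panel E; the unpack/shuffle sequence at the top of the
// routine transposes E in registers before solving from the last component
// down:
//
//    for(jj=7; jj>=0; jj--)
//        {
//        z[jj] *= inv_diag_E[jj];
//        for(ii=0; ii<jj; ii++)
//            z[ii] -= E[jj+8*ii] * z[jj];
//        }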

#if MACRO_LEVEL>=1
    .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_edge_trsv_lt_inv_8_lib8, @function
inner_edge_trsv_lt_inv_8_lib8:
#elif defined(OS_MAC)
_inner_edge_trsv_lt_inv_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
inner_edge_trsv_lt_inv_8_lib8:
#endif
#endif

    vxorps %ymm14, %ymm14, %ymm14

    vmovaps 0(%r10), %ymm12
    vblendps $0x01, %ymm14, %ymm12, %ymm12
    vmovaps 32(%r10), %ymm13
    vblendps $0x03, %ymm14, %ymm13, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm8
    vunpckhps %ymm13, %ymm12, %ymm9

    vmovaps 64(%r10), %ymm12
    vblendps $0x07, %ymm14, %ymm12, %ymm12
    vmovaps 96(%r10), %ymm13
    vblendps $0x0f, %ymm14, %ymm13, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm10
    vunpckhps %ymm13, %ymm12, %ymm11

    vshufps $0x44, %ymm10, %ymm8, %ymm7
    vshufps $0xee, %ymm10, %ymm8, %ymm4
    vshufps $0x44, %ymm11, %ymm9, %ymm5
    vshufps $0xee, %ymm11, %ymm9, %ymm6
    vextractf128 $0x1, %ymm7, %xmm7
    vextractf128 $0x1, %ymm4, %xmm8
    vextractf128 $0x1, %ymm5, %xmm9
    vextractf128 $0x1, %ymm6, %xmm10

    vmovaps 144(%r10), %xmm12
    vblendps $0x01, %xmm14, %xmm12, %xmm12
    vmovaps 176(%r10), %xmm13
    vblendps $0x03, %xmm14, %xmm13, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm1
    vunpckhps %xmm13, %xmm12, %xmm2

    vmovaps 208(%r10), %xmm12
    vblendps $0x07, %xmm14, %xmm12, %xmm12
    vmovaps 240(%r10), %xmm13
    vblendps $0x0f, %xmm14, %xmm13, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm3
    vunpckhps %xmm13, %xmm12, %xmm15

    vshufps $0xee, %xmm3, %xmm1, %xmm11
    vshufps $0x44, %xmm15, %xmm2, %xmm12
    vshufps $0xee, %xmm15, %xmm2, %xmm13


    vxorps %ymm14, %ymm14, %ymm14

    vextractf128 $0x1, %ymm0, %xmm1

    vshufps $0xff, %xmm1, %xmm1, %xmm2
    vbroadcastss 28(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x08, %xmm2, %xmm1, %xmm1
    vmulps %xmm10, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm13, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

    vshufps $0xaa, %xmm1, %xmm1, %xmm2
    vbroadcastss 24(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x04, %xmm2, %xmm1, %xmm1
    vmulps %xmm9, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm12, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

    vshufps $0x55, %xmm1, %xmm1, %xmm2
    vbroadcastss 20(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x02, %xmm2, %xmm1, %xmm1
    vmulps %xmm8, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm11, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

    vshufps $0x00, %xmm1, %xmm1, %xmm2
    vbroadcastss 16(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x01, %xmm2, %xmm1, %xmm1
    vmulps %xmm7, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

    vshufps $0xff, %xmm0, %xmm0, %xmm2
    vbroadcastss 12(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x08, %xmm2, %xmm0, %xmm0
    vmulps %xmm6, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

    vshufps $0xaa, %xmm0, %xmm0, %xmm2
    vbroadcastss 8(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x04, %xmm2, %xmm0, %xmm0
    vmulps %xmm5, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

    vshufps $0x55, %xmm0, %xmm0, %xmm2
    vbroadcastss 4(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x02, %xmm2, %xmm0, %xmm0
    vmulps %xmm4, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

    vshufps $0x00, %xmm0, %xmm0, %xmm2
    vbroadcastss 0(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x01, %xmm2, %xmm0, %xmm0

    vinsertf128 $0x1, %xmm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
#endif
#endif




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12d <- km
// r13d <- kn
// r14 <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12d <- km
// r13d <- kn
// r14 <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
    .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
inner_edge_trsv_lt_inv_8_vs_lib8:
#elif defined(OS_MAC)
_inner_edge_trsv_lt_inv_8_vs_lib8:
#elif defined(OS_WINDOWS)
    .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
inner_edge_trsv_lt_inv_8_vs_lib8:
#endif
#endif

    vcvtsi2ss %r13d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    vmovups .LC00(%rip), %ymm13
#elif defined(OS_MAC)
    vmovups LC00(%rip), %ymm13
#endif
    vshufps $0x00, %xmm14, %xmm14, %xmm14
    vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
    vsubps %ymm14, %ymm13, %ymm14

    vmovups 0(%r14), %ymm15
    vblendvps %ymm14, %ymm0, %ymm15, %ymm0



    vxorps %ymm14, %ymm14, %ymm14

    vmovaps 0(%r10), %ymm12
    vblendps $0x01, %ymm14, %ymm12, %ymm12
    cmpl $2, %r13d
    jl 1f
    vmovaps 32(%r10), %ymm13
    vblendps $0x03, %ymm14, %ymm13, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm8
    vunpckhps %ymm13, %ymm12, %ymm9

    cmpl $3, %r13d
    jl 2f
    vmovaps 64(%r10), %ymm12
    vblendps $0x07, %ymm14, %ymm12, %ymm12
    cmpl $4, %r13d
    jl 3f
    vmovaps 96(%r10), %ymm13
    vblendps $0x0f, %ymm14, %ymm13, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm10
    vunpckhps %ymm13, %ymm12, %ymm11

    vshufps $0x44, %ymm10, %ymm8, %ymm7
    vshufps $0xee, %ymm10, %ymm8, %ymm4
    vshufps $0x44, %ymm11, %ymm9, %ymm5
    vshufps $0xee, %ymm11, %ymm9, %ymm6
    vextractf128 $0x1, %ymm7, %xmm7
    vextractf128 $0x1, %ymm4, %xmm8
    vextractf128 $0x1, %ymm5, %xmm9
    vextractf128 $0x1, %ymm6, %xmm10

    cmpl $5, %r13d
    jl 4f
    vmovaps 144(%r10), %xmm12
    vblendps $0x01, %xmm14, %xmm12, %xmm12
    cmpl $6, %r13d
    jl 5f
    vmovaps 176(%r10), %xmm13
    vblendps $0x03, %xmm14, %xmm13, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm1
    vunpckhps %xmm13, %xmm12, %xmm2

    cmpl $7, %r13d
    jl 6f
    vmovaps 208(%r10), %xmm12
    vblendps $0x07, %xmm14, %xmm12, %xmm12
    cmpl $8, %r13d
    jl 7f
    vmovaps 240(%r10), %xmm13
    vblendps $0x0f, %xmm14, %xmm13, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm3
    vunpckhps %xmm13, %xmm12, %xmm15

    vshufps $0xee, %xmm3, %xmm1, %xmm11
    vshufps $0x44, %xmm15, %xmm2, %xmm12
    vshufps $0xee, %xmm15, %xmm2, %xmm13

    jmp 0f



    vmovaps %ymm14, %ymm12
1:
    vmovaps %ymm14, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm8
    vunpckhps %ymm13, %ymm12, %ymm9

2:
    vmovaps %ymm14, %ymm12
3:
    vmovaps %ymm14, %ymm13
    vunpcklps %ymm13, %ymm12, %ymm10
    vunpckhps %ymm13, %ymm12, %ymm11

    vshufps $0x44, %ymm10, %ymm8, %ymm7
    vshufps $0xee, %ymm10, %ymm8, %ymm4
    vshufps $0x44, %ymm11, %ymm9, %ymm5
    vshufps $0xee, %ymm11, %ymm9, %ymm6
    vextractf128 $0x1, %ymm7, %xmm7
    vextractf128 $0x1, %ymm4, %xmm8
    vextractf128 $0x1, %ymm5, %xmm9
    vextractf128 $0x1, %ymm6, %xmm10

    jmp 8f

4:
    vmovaps %xmm14, %xmm12
5:
    vmovaps %xmm14, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm1
    vunpckhps %xmm13, %xmm12, %xmm2

6:
    vmovaps %xmm14, %xmm12
7:
    vmovaps %xmm14, %xmm13
    vunpcklps %xmm13, %xmm12, %xmm3
    vunpckhps %xmm13, %xmm12, %xmm15

    vshufps $0xee, %xmm3, %xmm1, %xmm11
    vshufps $0x44, %xmm15, %xmm2, %xmm12
    vshufps $0xee, %xmm15, %xmm2, %xmm13

8:

    vmovaps %xmm14, %xmm11
    vmovaps %xmm14, %xmm12
    vmovaps %xmm14, %xmm13

0:
    vxorps %ymm14, %ymm14, %ymm14

    vextractf128 $0x1, %ymm0, %xmm1

    cmpl $8, %r12d
    jl 0f

    vshufps $0xff, %xmm1, %xmm1, %xmm2
    cmpl $8, %r13d
    jl 1f
    vbroadcastss 28(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x08, %xmm2, %xmm1, %xmm1
1:
    vmulps %xmm10, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm13, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

0:
    cmpl $7, %r12d
    jl 0f

    vshufps $0xaa, %xmm1, %xmm1, %xmm2
    cmpl $7, %r13d
    jl 1f
    vbroadcastss 24(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x04, %xmm2, %xmm1, %xmm1
1:
    vmulps %xmm9, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm12, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

0:
    cmpl $6, %r12d
    jl 0f

    vshufps $0x55, %xmm1, %xmm1, %xmm2
    cmpl $6, %r13d
    jl 1f
    vbroadcastss 20(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x02, %xmm2, %xmm1, %xmm1
1:
    vmulps %xmm8, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0
    vmulps %xmm11, %xmm2, %xmm15
    vsubps %xmm15, %xmm1, %xmm1

0:
    cmpl $5, %r12d
    jl 0f

    vshufps $0x00, %xmm1, %xmm1, %xmm2
    cmpl $5, %r13d
    jl 1f
    vbroadcastss 16(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x01, %xmm2, %xmm1, %xmm1
1:
    vmulps %xmm7, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

0:
    cmpl $4, %r12d
    jl 0f

    vshufps $0xff, %xmm0, %xmm0, %xmm2
    cmpl $4, %r13d
    jl 1f
    vbroadcastss 12(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x08, %xmm2, %xmm0, %xmm0
1:
    vmulps %xmm6, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

0:
    cmpl $3, %r12d
    jl 0f

    vshufps $0xaa, %xmm0, %xmm0, %xmm2
    cmpl $3, %r13d
    jl 1f
    vbroadcastss 8(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x04, %xmm2, %xmm0, %xmm0
1:
    vmulps %xmm5, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

0:
    cmpl $2, %r12d
    jl 0f

    vshufps $0x55, %xmm0, %xmm0, %xmm2
    cmpl $2, %r13d
    jl 1f
    vbroadcastss 4(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x02, %xmm2, %xmm0, %xmm0
1:
    vmulps %xmm4, %xmm2, %xmm15
    vsubps %xmm15, %xmm0, %xmm0

0:
    cmpl $1, %r12d
    jl 0f

    vshufps $0x00, %xmm0, %xmm0, %xmm2
    cmpl $1, %r13d
    jl 1f
    vbroadcastss 0(%r11), %xmm15
    vmulps %xmm2, %xmm15, %xmm2
    vblendps $0x01, %xmm2, %xmm0, %xmm0
1:

0:

    vinsertf128 $0x1, %xmm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
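//
// In C terms (my illustration): the four partial accumulators of the n kernel
// are summed and the generic scaling is applied:
//
//    // z[ii] = alpha[0]*(acc_a[ii]+acc_b[ii]+acc_c[ii]+acc_d[ii]) + beta[0]*y[ii]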

#if MACRO_LEVEL>=1
    .macro INNER_BLEND_N_SCALE_AB_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_blend_n_scale_ab_8_lib8, @function
inner_blend_n_scale_ab_8_lib8:
#elif defined(OS_MAC)
_inner_blend_n_scale_ab_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
inner_blend_n_scale_ab_8_lib8:
#endif
#endif

    // reduction
    vaddps %ymm0, %ymm1, %ymm0
    vaddps %ymm2, %ymm3, %ymm2
    vaddps %ymm0, %ymm2, %ymm0

    // alpha
    vbroadcastss 0(%r10), %ymm15
    vmulps %ymm0, %ymm15, %ymm0

    // beta
    vbroadcastss 0(%r11), %ymm15
    vmovups 0(%r12), %ymm14
    vmulps %ymm15, %ymm14, %ymm14
    vaddps %ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n, scale for alpha=-1.0 and beta=1.0
//
// input arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
    .macro INNER_BLEND_N_SCALE_M11_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_blend_n_scale_m11_8_lib8, @function
inner_blend_n_scale_m11_8_lib8:
#elif defined(OS_MAC)
_inner_blend_n_scale_m11_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
inner_blend_n_scale_m11_8_lib8:
#endif
#endif

    // reduction
    vaddps %ymm0, %ymm1, %ymm0
    vaddps %ymm2, %ymm3, %ymm2
    vaddps %ymm0, %ymm2, %ymm0

    // beta
    vmovups 0(%r10), %ymm14
    vsubps %ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm4 <- dirty
// ymm5 <- dirty
// ymm6 <- dirty
// ymm7 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
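//
// Here each accumulator holds the 8 lane-partials of one output element, so
// the reduction is a horizontal one (my illustration in C terms):
//
//    // z[ii] = alpha[0]*sum(acc_ii[0..7]) + beta[0]*y[ii], for ii = 0..7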

#if MACRO_LEVEL>=1
    .macro INNER_BLEND_T_SCALE_AB_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_blend_t_scale_ab_8_lib8, @function
inner_blend_t_scale_ab_8_lib8:
#elif defined(OS_MAC)
_inner_blend_t_scale_ab_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
inner_blend_t_scale_ab_8_lib8:
#endif
#endif

    // reduction
    vhaddps %ymm1, %ymm0, %ymm0
    vhaddps %ymm3, %ymm2, %ymm2
    vhaddps %ymm5, %ymm4, %ymm4
    vhaddps %ymm7, %ymm6, %ymm6

    vhaddps %ymm2, %ymm0, %ymm0
    vhaddps %ymm6, %ymm4, %ymm4

    vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
    vperm2f128 $0x13, %ymm0, %ymm4, %ymm0

    vaddps %ymm0, %ymm1, %ymm0

    // alpha
    vbroadcastss 0(%r10), %ymm15
    vmulps %ymm0, %ymm15, %ymm0

    // beta
    vbroadcastss 0(%r11), %ymm15
    vmovups 0(%r12), %ymm14
    vmulps %ymm15, %ymm14, %ymm14
    vaddps %ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t, scale for alpha=-1.0 and beta=1.0
//
// input arguments:
// r10 <- y
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm4 <- dirty
// ymm5 <- dirty
// ymm6 <- dirty
// ymm7 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
    .macro INNER_BLEND_T_SCALE_M11_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_blend_t_scale_m11_8_lib8, @function
inner_blend_t_scale_m11_8_lib8:
#elif defined(OS_MAC)
_inner_blend_t_scale_m11_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
inner_blend_t_scale_m11_8_lib8:
#endif
#endif

    // reduction
    vhaddps %ymm1, %ymm0, %ymm0
    vhaddps %ymm3, %ymm2, %ymm2
    vhaddps %ymm5, %ymm4, %ymm4
    vhaddps %ymm7, %ymm6, %ymm6

    vhaddps %ymm2, %ymm0, %ymm0
    vhaddps %ymm6, %ymm4, %ymm4

    vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
    vperm2f128 $0x13, %ymm0, %ymm4, %ymm0

    vaddps %ymm0, %ymm1, %ymm0

    // beta
    vmovups 0(%r10), %ymm14
    vsubps %ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
#endif
#endif




// common inner routine with file scope
//
// store
//
// input arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
//
// output arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]

#if MACRO_LEVEL>=1
    .macro INNER_STORE_8_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_store_8_lib8, @function
inner_store_8_lib8:
#elif defined(OS_MAC)
_inner_store_8_lib8:
#elif defined(OS_WINDOWS)
    .def inner_store_8_lib8; .scl 2; .type 32; .endef
inner_store_8_lib8:
#endif
#endif

    vmovups %ymm0, 0(%r10)

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_store_8_lib8, .-inner_store_8_lib8
#endif
#endif




// common inner routine with file scope
//
// store vs
//
// input arguments:
// r10 <- D
// r11d <- km
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- D
// r11d <- km
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
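//
// A sketch of the masked store (my illustration; it assumes .LC00 holds
// ascending per-lane constants): only the first km elements are written back:
//
//    // if(ii < km) D[ii] = z[ii], for lanes ii = 0..7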

#if MACRO_LEVEL>=1
    .macro INNER_STORE_8_VS_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_store_8_vs_lib8, @function
inner_store_8_vs_lib8:
#elif defined(OS_MAC)
_inner_store_8_vs_lib8:
#elif defined(OS_WINDOWS)
    .def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
inner_store_8_vs_lib8:
#endif
#endif

    vcvtsi2ss %r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    vmovups .LC00(%rip), %ymm14
#elif defined(OS_MAC)
    vmovups LC00(%rip), %ymm14
#endif
    vshufps $0x00, %xmm15, %xmm15, %xmm15
    vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
    vsubps %ymm15, %ymm14, %ymm15

    vmaskmovps %ymm0, %ymm15, 0(%r10)

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
#endif
#endif




// common inner routine with file scope
//
// store gen
//
// input arguments:
// r10 <- D
// r11d <- k0 : start from (inc)
// r12d <- k1 : up to (exc)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- D
// r11d <- k0 : start from (inc)
// r12d <- k1 : up to (exc)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
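//
// Same masking idea with both bounds (my illustration):
//
//    // if(ii >= k0 && ii < k1) D[ii] = z[ii], for lanes ii = 0..7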

#if MACRO_LEVEL>=1
    .macro INNER_STORE_8_GEN_LIB8
#else
    .p2align 4,,15
#if defined(OS_LINUX)
    .type inner_store_8_gen_lib8, @function
inner_store_8_gen_lib8:
#elif defined(OS_MAC)
_inner_store_8_gen_lib8:
#elif defined(OS_WINDOWS)
    .def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
inner_store_8_gen_lib8:
#endif
#endif

    // compute mask for rows
    vcvtsi2ss %r11d, %xmm14, %xmm14
    vcvtsi2ss %r12d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    vmovups .LC00(%rip), %ymm12
#elif defined(OS_MAC)
    vmovups LC00(%rip), %ymm12
#endif
    vshufps $0x00, %xmm14, %xmm14, %xmm14
    vshufps $0x00, %xmm15, %xmm15, %xmm15
    vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
    vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
    vsubps %ymm12, %ymm14, %ymm14
    vsubps %ymm15, %ymm12, %ymm15
    vandps %ymm14, %ymm15, %ymm15

    vmaskmovps %ymm0, %ymm15, 0(%r10)

#if MACRO_LEVEL>=1
    .endm
#else
    ret

#if defined(OS_LINUX)
    .size inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
#endif
#endif




//                            1      2             3         4         5            6         7
// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_n_8_lib8
    .type kernel_sgemv_n_8_lib8, @function
kernel_sgemv_n_8_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_n_8_lib8
_kernel_sgemv_n_8_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_n_8_lib8
    .def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
kernel_sgemv_n_8_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3


    // call inner sgemv kernel n

    movq ARG1, %r10 // k
    movq ARG3, %r11 // A
    movq ARG4, %r12 // x

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_n_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_n_8_lib8
#endif
#endif


    // call inner blend n scale ab

    movq ARG2, %r10 // alpha
    movq ARG5, %r11 // beta
    movq ARG6, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_N_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_n_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_n_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG7, %r10 // z

#if MACRO_LEVEL>=1
    INNER_STORE_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
#endif




//                               1      2             3         4         5            6         7         8
// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_n_8_vs_lib8
    .type kernel_sgemv_n_8_vs_lib8, @function
kernel_sgemv_n_8_vs_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_n_8_vs_lib8
_kernel_sgemv_n_8_vs_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_n_8_vs_lib8
    .def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
kernel_sgemv_n_8_vs_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3


    // call inner sgemv kernel n

    movq ARG1, %r10 // k
    movq ARG3, %r11 // A
    movq ARG4, %r12 // x

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_n_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_n_8_lib8
#endif
#endif


    // call inner blend n scale ab

    movq ARG2, %r10 // alpha
    movq ARG5, %r11 // beta
    movq ARG6, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_N_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_n_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_n_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG7, %r10 // z
    movq ARG8, %r11 // k1

#if MACRO_LEVEL>=1
    INNER_STORE_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_vs_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_vs_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
#endif




//                                1      2             3         4         5            6         7         8       9
// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_n_8_gen_lib8
    .type kernel_sgemv_n_8_gen_lib8, @function
kernel_sgemv_n_8_gen_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_n_8_gen_lib8
_kernel_sgemv_n_8_gen_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_n_8_gen_lib8
    .def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
kernel_sgemv_n_8_gen_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3


    // call inner sgemv kernel n

    movq ARG1, %r10 // k
    movq ARG3, %r11 // A
    movq ARG4, %r12 // x

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_n_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_n_8_lib8
#endif
#endif


    // call inner blend n scale ab

    movq ARG2, %r10 // alpha
    movq ARG5, %r11 // beta
    movq ARG6, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_N_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_n_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_n_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG7, %r10 // z
    movq ARG8, %r11 // k0
    movq ARG9, %r12 // k1

#if MACRO_LEVEL>=1
    INNER_STORE_8_GEN_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_gen_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_gen_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
#endif




//                            1      2             3         4        5         6            7         8
// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_t_8_lib8
    .type kernel_sgemv_t_8_lib8, @function
kernel_sgemv_t_8_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_t_8_lib8
_kernel_sgemv_t_8_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_t_8_lib8
    .def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
kernel_sgemv_t_8_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3
    vmovaps %ymm0, %ymm4
    vmovaps %ymm0, %ymm5
    vmovaps %ymm0, %ymm6
    vmovaps %ymm0, %ymm7


    // call inner sgemv kernel t

    movq ARG1, %r10 // k
    movq ARG3, %r11 // A
    movq ARG4, %r12 // sda
    sall $5, %r12d // 8*sda*sizeof(float)
    movq ARG5, %r13 // x

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_t_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_t_8_lib8
#endif
#endif


    // call inner blender t

    movq ARG2, %r10 // alpha
    movq ARG6, %r11 // beta
    movq ARG7, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_T_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_t_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_t_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG8, %r10 // z

#if MACRO_LEVEL>=1
    INNER_STORE_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
#endif




//                               1      2             3         4        5         6            7         8         9
// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_t_8_vs_lib8
    .type kernel_sgemv_t_8_vs_lib8, @function
kernel_sgemv_t_8_vs_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_t_8_vs_lib8
_kernel_sgemv_t_8_vs_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_t_8_vs_lib8
    .def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
kernel_sgemv_t_8_vs_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3
    vmovaps %ymm0, %ymm4
    vmovaps %ymm0, %ymm5
    vmovaps %ymm0, %ymm6
    vmovaps %ymm0, %ymm7


    // call inner sgemv kernel t

    movq ARG1, %r10 // k
    movq ARG3, %r11 // A
    movq ARG4, %r12 // sda
    sall $5, %r12d // 8*sda*sizeof(float)
    movq ARG5, %r13 // x

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_t_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_t_8_lib8
#endif
#endif


    // call inner blender t

    movq ARG2, %r10 // alpha
    movq ARG6, %r11 // beta
    movq ARG7, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_T_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_t_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_t_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG8, %r10 // z
    movq ARG9, %r11 // k1

#if MACRO_LEVEL>=1
    INNER_STORE_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_vs_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_vs_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
#endif




//                                1      2             3         4         5        6         7            8         9         10
// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);

    .p2align 4,,15
#if defined(OS_LINUX)
    .globl kernel_sgemv_t_8_gen_lib8
    .type kernel_sgemv_t_8_gen_lib8, @function
kernel_sgemv_t_8_gen_lib8:
#elif defined(OS_MAC)
    .globl _kernel_sgemv_t_8_gen_lib8
_kernel_sgemv_t_8_gen_lib8:
#elif defined(OS_WINDOWS)
    .globl kernel_sgemv_t_8_gen_lib8
    .def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
kernel_sgemv_t_8_gen_lib8:
#endif

    PROLOGUE

    // zero accumulation registers

    vxorps %ymm0, %ymm0, %ymm0
    vmovaps %ymm0, %ymm1
    vmovaps %ymm0, %ymm2
    vmovaps %ymm0, %ymm3
    vmovaps %ymm0, %ymm4
    vmovaps %ymm0, %ymm5
    vmovaps %ymm0, %ymm6
    vmovaps %ymm0, %ymm7


    // call inner sgemv kernel t

    movq ARG1, %r10 // k
    movq ARG4, %r11 // A
    movq ARG5, %r12 // sda
    sall $5, %r12d // 8*sda*sizeof(float)
    movq ARG6, %r13 // x
    movq ARG3, %r14 // offA

#if MACRO_LEVEL>=2
    INNER_EDGE_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_edge_gemv_add_t_8_lib8
#elif defined(OS_MAC)
    callq _inner_edge_gemv_add_t_8_lib8
#endif
#endif

#if MACRO_LEVEL>=2
    INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_kernel_gemv_add_t_8_lib8
#elif defined(OS_MAC)
    callq _inner_kernel_gemv_add_t_8_lib8
#endif
#endif


    // call inner blender t

    movq ARG2, %r10 // alpha
    movq ARG7, %r11 // beta
    movq ARG8, %r12 // y

#if MACRO_LEVEL>=1
    INNER_BLEND_T_SCALE_AB_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_blend_t_scale_ab_8_lib8
#elif defined(OS_MAC)
    callq _inner_blend_t_scale_ab_8_lib8
#endif
#endif


    // store

    movq ARG9, %r10 // z
    movq ARG10, %r11 // km

#if MACRO_LEVEL>=1
    INNER_STORE_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
    call inner_store_8_vs_lib8
#elif defined(OS_MAC)
    callq _inner_store_8_vs_lib8
#endif
#endif


    EPILOGUE

    ret

#if defined(OS_LINUX)
    .size kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
#endif




//                                 1      2         3                   4         5         6
// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_strsv_ln_inv_8_lib8
	.type kernel_strsv_ln_inv_8_lib8, @function
kernel_strsv_ln_inv_8_lib8:
#elif defined(OS_MAC)
	.globl _kernel_strsv_ln_inv_8_lib8
_kernel_strsv_ln_inv_8_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_strsv_ln_inv_8_lib8
	.def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
kernel_strsv_ln_inv_8_lib8:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorps %ymm0, %ymm0, %ymm0
	vmovaps %ymm0, %ymm1
	vmovaps %ymm0, %ymm2
	vmovaps %ymm0, %ymm3
	vmovaps %ymm0, %ymm4
	vmovaps %ymm0, %ymm5
	vmovaps %ymm0, %ymm6
	vmovaps %ymm0, %ymm7


	// call inner sgemv kernel n

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_gemv_add_n_8_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_gemv_add_n_8_lib8
#endif
#endif

	movq %r11, %r13 // A + k*8*sizeof(float), i.e. past the k processed columns


	// call inner blender n

	movq ARG5, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_M11_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_scale_m11_8_lib8
#elif defined(OS_MAC)
	callq _inner_blend_n_scale_m11_8_lib8
#endif
#endif


	// solution

	movq %r13, %r10 // A + k*8*sizeof(float)
	movq ARG3, %r11 // inv_diag_A

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LN_INV_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_trsv_ln_inv_8_lib8
#elif defined(OS_MAC)
	callq _inner_edge_trsv_ln_inv_8_lib8
#endif
#endif


	// store

	movq ARG6, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib8
#elif defined(OS_MAC)
	callq _inner_store_8_lib8
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
#endif




//                                     1         2               3         4         5         6        7       8
// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
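//
// Variable-size form of the kernel above; judging from the argument moves
// below, kn clamps the triangular substitution and km clamps the store. A
// hedged sketch under the same plain row-major assumptions as the reference
// above:
//
// void strsv_ln_inv_8_vs_ref(int k, const float *A, int lda,
//                            const float *inv_diag_A, const float *x,
//                            const float *y, float *z, int km, int kn)
// 	{
// 	float t[8];
// 	for(int i=0; i<8; i++)
// 		{
// 		t[i] = y[i];
// 		for(int j=0; j<k; j++)
// 			t[i] -= A[i*lda+j] * x[j];
// 		}
// 	for(int i=0; i<kn; i++) // kn bounds the substitution (ARG8 below)
// 		{
// 		for(int j=0; j<i; j++)
// 			t[i] -= A[i*lda+k+j] * t[j];
// 		t[i] *= inv_diag_A[i];
// 		}
// 	for(int i=0; i<km; i++) // km bounds the write-back (ARG7 below)
// 		z[i] = t[i];
// 	}
//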

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_strsv_ln_inv_8_vs_lib8
	.type kernel_strsv_ln_inv_8_vs_lib8, @function
kernel_strsv_ln_inv_8_vs_lib8:
#elif defined(OS_MAC)
	.globl _kernel_strsv_ln_inv_8_vs_lib8
_kernel_strsv_ln_inv_8_vs_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_strsv_ln_inv_8_vs_lib8
	.def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
kernel_strsv_ln_inv_8_vs_lib8:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorps %ymm0, %ymm0, %ymm0
	vmovaps %ymm0, %ymm1
	vmovaps %ymm0, %ymm2
	vmovaps %ymm0, %ymm3
	vmovaps %ymm0, %ymm4
	vmovaps %ymm0, %ymm5
	vmovaps %ymm0, %ymm6
	vmovaps %ymm0, %ymm7


	// call inner sgemv kernel n

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_gemv_add_n_8_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_gemv_add_n_8_lib8
#endif
#endif

	movq %r11, %r13 // A + k*8*sizeof(float), i.e. past the k processed columns


	// call inner blender n

	movq ARG5, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_M11_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_scale_m11_8_lib8
#elif defined(OS_MAC)
	callq _inner_blend_n_scale_m11_8_lib8
#endif
#endif


	// solution

	movq %r13, %r10 // A + k*8*sizeof(float)
	movq ARG3, %r11 // inv_diag_A
	movq ARG8, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_trsv_ln_inv_8_vs_lib8
#elif defined(OS_MAC)
	callq _inner_edge_trsv_ln_inv_8_vs_lib8
#endif
#endif


	// store

	movq ARG6, %r10 // z
	movq ARG7, %r11 // km

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_vs_lib8
#elif defined(OS_MAC)
	callq _inner_store_8_vs_lib8
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
#endif




//                                  1         2        3          4              5         6         7
// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
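//
// Hedged reference sketch (an assumption, not part of this file): the kernel
// appears to skip the first 8 rows of A and x (see the subl $8 / addq $32
// below), accumulate A^T*x over the remaining rows, subtract that from y, and
// back-substitute through the transposed 8x8 lower-triangular block, again
// with a pre-inverted diagonal. Plain row-major layout assumed for clarity.
//
// void strsv_lt_inv_8_ref(int k, const float *A, int lda,
//                         const float *inv_diag_A, const float *x,
//                         const float *y, float *z)
// 	{
// 	for(int i=7; i>=0; i--)
// 		{
// 		float t = y[i];
// 		for(int j=8; j<k; j++)
// 			t -= A[j*lda+i] * x[j]; // A^T gemv over the rows below the block
// 		for(int j=i+1; j<8; j++)
// 			t -= A[j*lda+i] * z[j]; // backward substitution with L^T
// 		z[i] = t * inv_diag_A[i];
// 		}
// 	}
//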

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_strsv_lt_inv_8_lib8
	.type kernel_strsv_lt_inv_8_lib8, @function
kernel_strsv_lt_inv_8_lib8:
#elif defined(OS_MAC)
	.globl _kernel_strsv_lt_inv_8_lib8
_kernel_strsv_lt_inv_8_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_strsv_lt_inv_8_lib8
	.def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
kernel_strsv_lt_inv_8_lib8:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorps %ymm0, %ymm0, %ymm0
	vmovaps %ymm0, %ymm1
	vmovaps %ymm0, %ymm2
	vmovaps %ymm0, %ymm3
	vmovaps %ymm0, %ymm4
	vmovaps %ymm0, %ymm5
	vmovaps %ymm0, %ymm6
	vmovaps %ymm0, %ymm7


	// call inner sgemv kernel t

	movq ARG1, %r10 // k
	subl $8, %r10d // k-8
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 8*sda*sizeof(float)
	addq %r12, %r11 // A+8*sda*sizeof(float)
	movq ARG5, %r13 // x
	addq $32, %r13 // x+8

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_gemv_add_t_8_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_gemv_add_t_8_lib8
#endif
#endif


	// call inner blender t

	movq ARG6, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_M11_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_t_scale_m11_8_lib8
#elif defined(OS_MAC)
	callq _inner_blend_t_scale_m11_8_lib8
#endif
#endif


	// solution

	movq ARG2, %r10 // A
	movq ARG4, %r11 // inv_diag_A

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LT_INV_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_trsv_lt_inv_8_lib8
#elif defined(OS_MAC)
	callq _inner_edge_trsv_lt_inv_8_lib8
#endif
#endif


	// store

	movq ARG7, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib8
#elif defined(OS_MAC)
	callq _inner_store_8_lib8
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
#endif




//                                     1         2        3          4              5         6         7        8       9
// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
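//
// Variable-size form of the kernel above: km, kn and x are all forwarded to
// the triangular edge routine, and the store is clamped by kn (note the ARG9
// move before inner_store_8_vs_lib8 below). A hedged sketch of the assumed
// write-back, with t the solved block as in the reference above:
//
// 	for(int i=0; i<kn; i++) // kn bounds the write-back (ARG9 below)
// 		z[i] = t[i];
//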

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_strsv_lt_inv_8_vs_lib8
	.type kernel_strsv_lt_inv_8_vs_lib8, @function
kernel_strsv_lt_inv_8_vs_lib8:
#elif defined(OS_MAC)
	.globl _kernel_strsv_lt_inv_8_vs_lib8
_kernel_strsv_lt_inv_8_vs_lib8:
#elif defined(OS_WINDOWS)
	.globl kernel_strsv_lt_inv_8_vs_lib8
	.def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
kernel_strsv_lt_inv_8_vs_lib8:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorps %ymm0, %ymm0, %ymm0
	vmovaps %ymm0, %ymm1
	vmovaps %ymm0, %ymm2
	vmovaps %ymm0, %ymm3
	vmovaps %ymm0, %ymm4
	vmovaps %ymm0, %ymm5
	vmovaps %ymm0, %ymm6
	vmovaps %ymm0, %ymm7


	// call inner sgemv kernel t

	movq ARG1, %r10 // k
	subl $8, %r10d // k-8
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 8*sda*sizeof(float)
	addq %r12, %r11 // A+8*sda*sizeof(float)
	movq ARG5, %r13 // x
	addq $32, %r13 // x+8

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_gemv_add_t_8_lib8
#elif defined(OS_MAC)
	callq _inner_kernel_gemv_add_t_8_lib8
#endif
#endif


	// call inner blender t

	movq ARG6, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_M11_8_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_t_scale_m11_8_lib8
#elif defined(OS_MAC)
	callq _inner_blend_t_scale_m11_8_lib8
#endif
#endif


	// solution

	movq ARG2, %r10 // A
	movq ARG4, %r11 // inv_diag_A
	movq ARG8, %r12 // km
	movq ARG9, %r13 // kn
	movq ARG5, %r14 // x

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_trsv_lt_inv_8_vs_lib8
#elif defined(OS_MAC)
	callq _inner_edge_trsv_lt_inv_8_vs_lib8
#endif
#endif


	// store

	movq ARG7, %r10 // z
	movq ARG9, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_vs_lib8
#elif defined(OS_MAC)
	callq _inner_store_8_vs_lib8
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
#endif




	// read-only data
#if defined(OS_LINUX)
	.section .rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section __TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif

#if defined(OS_LINUX)
	.align 32
.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#endif
	.float 0.5
	.float 1.5
	.float 2.5
	.float 3.5
	.float 4.5
	.float 5.5
	.float 6.5
	.float 7.5
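//
// A minimal sketch of how these half-integer lane indices are assumed to be
// used: compared lane-wise against a broadcast float(km) they yield the store
// mask of the _vs/_gen kernels above (register choices are illustrative, and
// the register-source vbroadcastss form assumes AVX2):
//
// 	vcvtsi2ss	%r11d, %xmm15, %xmm15 // km as float
// 	vbroadcastss	%xmm15, %ymm15
// 	vmovups		.LC00(%rip), %ymm14
// 	vcmpps		$1, %ymm15, %ymm14, %ymm14 // lane i set iff i+0.5 < km
// 	vmaskmovps	%ymm0, %ymm14, (%r10) // store only the first km floats
//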




#if defined(OS_LINUX)
	.section .note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
