/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

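// System V AMD64 calling convention: the first six integer/pointer arguments
// arrive in rdi, rsi, rdx, rcx, r8, r9; further arguments live on the stack
// above the return address (hence STACKSIZE + 8(%rsp) after the prologue's
// subq). rbx, rbp and r12-r15 are callee-saved, so PROLOGUE/EPILOGUE spill
// and restore exactly those.
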
//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1 %rdi
#define ARG2 %rsi
#define ARG3 %rdx
#define ARG4 %rcx
#define ARG5 %r8
#define ARG6 %r9
#define ARG7 STACKSIZE + 8(%rsp)
#define ARG8 STACKSIZE + 16(%rsp)
#define ARG9 STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	addq $STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

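// Microsoft x64 calling convention: the first four integer/pointer arguments
// arrive in rcx, rdx, r8, r9, with a 32-byte shadow space reserved by the
// caller (stack arguments therefore start at rsp+40 on entry). Besides rbx,
// rbp and r12-r15, also rdi, rsi and xmm6-xmm15 are callee-saved, hence the
// larger STACKSIZE and the vmovups spills below.
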
#define STACKSIZE 256
#define ARG1 %rcx
#define ARG2 %rdx
#define ARG3 %r8
#define ARG4 %r9
#define ARG5 STACKSIZE + 40(%rsp)
#define ARG6 STACKSIZE + 48(%rsp)
#define ARG7 STACKSIZE + 56(%rsp)
#define ARG8 STACKSIZE + 64(%rsp)
#define ARG9 STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp); \
	movq %rdi, 48(%rsp); \
	movq %rsi, 56(%rsp); \
	vmovups %xmm6, 64(%rsp); \
	vmovups %xmm7, 80(%rsp); \
	vmovups %xmm8, 96(%rsp); \
	vmovups %xmm9, 112(%rsp); \
	vmovups %xmm10, 128(%rsp); \
	vmovups %xmm11, 144(%rsp); \
	vmovups %xmm12, 160(%rsp); \
	vmovups %xmm13, 176(%rsp); \
	vmovups %xmm14, 192(%rsp); \
	vmovups %xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	movq 48(%rsp), %rdi; \
	movq 56(%rsp), %rsi; \
	vmovups 64(%rsp), %xmm6; \
	vmovups 80(%rsp), %xmm7; \
	vmovups 96(%rsp), %xmm8; \
	vmovups 112(%rsp), %xmm9; \
	vmovups 128(%rsp), %xmm10; \
	vmovups 144(%rsp), %xmm11; \
	vmovups 160(%rsp), %xmm12; \
	vmovups 176(%rsp), %xmm13; \
	vmovups 192(%rsp), %xmm14; \
	vmovups 208(%rsp), %xmm15; \
	addq $STACKSIZE, %rsp;

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section __TEXT,__text,regular,pure_instructions
#endif

// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- 4*sda*sizeof(double)
// r13 <- x
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- 4*sda*sizeof(double)
// r13 <- x+k*sizeof(double)
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

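// A hedged C sketch of this kernel's semantics (not part of the build): A is
// packed in 4-row panels ("lib4"), with A0 = A holding rows 0-3 and
// A1 = A0 + 4*sda doubles holding rows 4-7. Even columns accumulate into the
// _a pair (ymm0/ymm1), odd columns into the _b pair (ymm2/ymm3); the pairs
// are summed later by the blend routine.
//
// for(j=0; j<k; j++)
//     for(i=0; i<4; i++)
//         {
//         z[i+0] += A0[i+4*j] * x[j];
//         z[i+4] += A1[i+4*j] * x[j];
//         }
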
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemv_add_n_8_lib4, @function
inner_kernel_dgemv_add_n_8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemv_add_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemv_add_n_8_lib4:
#endif
#endif

	cmpl $0, %r10d
	jle 2f // return

	movq %r11, %r15 // A1 <- A0
	addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)

	cmpl $4, %r10d

	prefetcht0 0(%r11) // software prefetch
	prefetcht0 0(%r15) // software prefetch
	prefetcht0 64(%r11) // software prefetch
	prefetcht0 64(%r15) // software prefetch

	jl 0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0 128(%r11) // software prefetch
	prefetcht0 128(%r15) // software prefetch

	vbroadcastsd 0(%r13), %ymm12
	vmovapd 0(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovapd 0(%r15), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	subl $4, %r10d

	vbroadcastsd 8(%r13), %ymm12
	vmovapd 32(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2
	vmovapd 32(%r15), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	prefetcht0 192(%r11) // software prefetch
	prefetcht0 192(%r15) // software prefetch

	vbroadcastsd 16(%r13), %ymm12
	vmovapd 64(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovapd 64(%r15), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	vbroadcastsd 24(%r13), %ymm12
	addq $32, %r13 // x+4
	vmovapd 96(%r11), %ymm8
	addq $128, %r11 // A0+4*bs
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2
	vmovapd 96(%r15), %ymm8
	addq $128, %r15 // A1+4*bs
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	cmpl $3, %r10d

	jg 1b // main loop


	// consider clean-up
	cmpl $0, %r10d
	jle 2f // return

0: // clean-up

	vbroadcastsd 0(%r13), %ymm12
	vmovapd 0(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovapd 0(%r15), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	addq $32, %r11
	addq $32, %r15
	addq $8, %r13

	subl $1, %r10d
	cmpl $0, %r10d

	jg 0b // clean

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- bs*sda*sizeof(double) = 32*sda
// r13 <- x
// r14 <- dirty
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- bs*sda*sizeof(double) = 32*sda
// r13 <- x+k*sizeof(double)
// r14 <- dirty
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

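// A hedged C sketch of this kernel's semantics (not part of the build): eight
// dot products over the packed panels, consuming 4 entries of x per iteration;
// each of ymm0..ymm7 keeps 4 partial sums of one output, reduced later by the
// blend routine. The tail (k%4) is handled with a masked load of x.
//
// for(i=0; i<k; i+=4)
//     for(j=0; j<8; j++)
//         for(l=0; l<4 && i+l<k; l++)
//             z_part[j][l] += A[l + 4*j + (i/4)*4*sda] * x[i+l];
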
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemv_add_t_8_lib4, @function
inner_kernel_dgemv_add_t_8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemv_add_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemv_add_t_8_lib4:
#endif
#endif

	cmpl $0, %r10d
	jle 2f // return

	cmpl $4, %r10d

	prefetcht0 0(%r11) // software prefetch
	prefetcht0 64(%r11) // software prefetch
	prefetcht0 128(%r11) // software prefetch
	prefetcht0 192(%r11) // software prefetch

	jl 0f // clean-up loop

	movq %r11, %r14
	addq %r12, %r14 // A+bs*sda

	// main loop
	.p2align 3
1: // main loop

	prefetcht0 0(%r14) // software prefetch

	vmovupd 0(%r13), %ymm12
	addq $32, %r13 // x+4

	vmovapd 0(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0

	subl $4, %r10d

	vmovapd 32(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	prefetcht0 64(%r14) // software prefetch

	vmovapd 64(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2

	vmovapd 96(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	prefetcht0 128(%r14) // software prefetch

	vmovapd 128(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm4, %ymm15, %ymm4

	vmovapd 160(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm5, %ymm15, %ymm5

	prefetcht0 192(%r14) // software prefetch

	vmovapd 192(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm6, %ymm15, %ymm6

	vmovapd 224(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm7, %ymm15, %ymm7

//	addq %r12, %r11 // A+bs*sda
	movq %r14, %r11 // A+bs*sda
	addq %r12, %r14 // A+bs*sda+bs*sda

	cmpl $3, %r10d

	jg 1b // main loop


	// consider clean-up
	cmpl $0, %r10d
	jle 2f // return

0: // clean-up

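	// Tail handling (k%4 leftovers) via masked load: broadcast (double)k_left,
	// subtract it from { 0.5, 1.5, 2.5, 3.5 } (.LC02); lanes where the result
	// is negative (i.e. lane index < k_left) have the sign bit set, so
	// vmaskmovpd reads exactly the remaining k_left entries of x.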
	vcvtsi2sd %r10d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd .LC02(%rip), %ymm13
#elif defined(OS_MAC)
	vmovupd LC02(%rip), %ymm13
#endif
	vmovddup %xmm14, %xmm14
	vinsertf128 $1, %xmm14, %ymm14, %ymm14
	vsubpd %ymm14, %ymm13, %ymm14

	vmaskmovpd 0(%r13), %ymm14, %ymm12

	vmovapd 0(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0

	vmovapd 32(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	vmovapd 64(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2

	vmovapd 96(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	vmovapd 128(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm4, %ymm15, %ymm4

	vmovapd 160(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm5, %ymm15, %ymm5

	vmovapd 192(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm6, %ymm15, %ymm6

	vmovapd 224(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm7, %ymm15, %ymm7

	sall $3, %r10d
//	movslq %r10d, %r10
	addq %r10, %r11
	addq %r10, %r13
	xorl %r10d, %r10d


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- 4*sda*sizeof(double)
// r13 <- x
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

//
// output arguments:
// r10d <- k-8
// r11 <- A+4*8*sizeof(double)
// r12 <- 4*sda*sizeof(double)
// r13 <- x+8*sizeof(double)
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

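// A hedged C sketch (not part of the build): the edge handles the first 8
// (triangular) columns of the upper-triangular, non-transposed A; the vblendpd
// masks 0x1/0x3/0x7 zero the entries below the diagonal of each 4x4 diagonal
// block.
//
// for(j=0; j<8; j++)
//     for(i=0; i<=j; i++)
//         z[i] += A[i,j] * x[j]; // A[i,j] read from the packed panel layout
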
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRMV_UN_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrmv_un_8_lib4, @function
inner_edge_dtrmv_un_8_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrmv_un_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
inner_edge_dtrmv_un_8_lib4:
#endif
#endif

	movq %r11, %r15 // A1 <- A0
	addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)

	vxorpd %ymm14, %ymm14, %ymm14

	// first 4 columns
	vmovapd 0(%r11), %ymm8
	vblendpd $0x1, %ymm8, %ymm14, %ymm8
	vbroadcastsd 0(%r13), %ymm12
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0

	subl $4, %r10d

	vmovapd 32(%r11), %ymm8
	vblendpd $0x3, %ymm8, %ymm14, %ymm8
	vbroadcastsd 8(%r13), %ymm12
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2

	vmovapd 64(%r11), %ymm8
	vblendpd $0x7, %ymm8, %ymm14, %ymm8
	vbroadcastsd 16(%r13), %ymm12
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0

	vmovapd 96(%r11), %ymm8
	vbroadcastsd 24(%r13), %ymm12
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2

	addq $128, %r11
	addq $128, %r15
	addq $32, %r13



	// last 4 columns
	vbroadcastsd 0(%r13), %ymm12
	vmovapd 0(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovapd 0(%r15), %ymm8
	vblendpd $0x1, %ymm8, %ymm14, %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	subl $4, %r10d

	vbroadcastsd 8(%r13), %ymm12
	vmovapd 32(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2
	vmovapd 32(%r15), %ymm8
	vblendpd $0x3, %ymm8, %ymm14, %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	vbroadcastsd 16(%r13), %ymm12
	vmovapd 64(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovapd 64(%r15), %ymm8
	vblendpd $0x7, %ymm8, %ymm14, %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	vbroadcastsd 24(%r13), %ymm12
	vmovapd 96(%r11), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm2, %ymm15, %ymm2
	vmovapd 96(%r15), %ymm8
	vmulpd %ymm8, %ymm12, %ymm15
	vaddpd %ymm3, %ymm15, %ymm3

	addq $128, %r11
	addq $128, %r15
	addq $32, %r13


#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n
//
// input arguments:
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

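// A hedged C sketch of the reduction (not part of the build): the two
// accumulator pairs from the n kernel are summed into one value per output.
//
// for(i=0; i<4; i++)
//     {
//     z[i+0] = z_a[i+0] + z_b[i+0]; // ymm0 += ymm2
//     z[i+4] = z_a[i+4] + z_b[i+4]; // ymm1 += ymm3
//     }
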
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_n_8_lib4, @function
inner_blend_n_8_lib4:
#elif defined(OS_MAC)
_inner_blend_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
inner_blend_n_8_lib4:
#endif
#endif

	// reduction
	vaddpd %ymm0, %ymm2, %ymm0
	vaddpd %ymm1, %ymm3, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t
//
// input arguments:
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

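// The hadd/permute cascade below is a 4-way horizontal reduction: vhaddpd sums
// adjacent lanes within each 128-bit half, and the two vperm2f128 regroup the
// low and high halves so the final vaddpd yields one sum per output. A hedged
// C sketch (not part of the build):
//
// for(j=0; j<8; j++)
//     z[j] = z_part[j][0] + z_part[j][1] + z_part[j][2] + z_part[j][3];
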
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_t_8_lib4, @function
inner_blend_t_8_lib4:
#elif defined(OS_MAC)
_inner_blend_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
inner_blend_t_8_lib4:
#endif
#endif

	// reduction
	vhaddpd %ymm1, %ymm0, %ymm0
	vhaddpd %ymm5, %ymm4, %ymm4
	vhaddpd %ymm3, %ymm2, %ymm2
	vhaddpd %ymm7, %ymm6, %ymm6
	vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
	vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
	vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
	vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
	vaddpd %ymm0, %ymm3, %ymm0
	vaddpd %ymm4, %ymm5, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

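// A hedged C sketch of blend + scale (not part of the build):
//
// for(i=0; i<8; i++)
//     z[i] = alpha[0] * (z_a[i] + z_b[i]) + beta[0] * y[i];
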
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_AB_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_n_scale_ab_8_lib4, @function
inner_blend_n_scale_ab_8_lib4:
#elif defined(OS_MAC)
_inner_blend_n_scale_ab_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
inner_blend_n_scale_ab_8_lib4:
#endif
#endif

	// reduction
	vaddpd %ymm0, %ymm2, %ymm0
	vaddpd %ymm1, %ymm3, %ymm1

	// alpha
	vbroadcastsd 0(%r10), %ymm15
	vmulpd %ymm0, %ymm15, %ymm0
	vmulpd %ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd 0(%r11), %ymm15
	vmovupd 0(%r12), %ymm14
	vmulpd %ymm15, %ymm14, %ymm14
	vaddpd %ymm0, %ymm14, %ymm0
	vmovupd 32(%r12), %ymm14
	vmulpd %ymm15, %ymm14, %ymm14
	vaddpd %ymm1, %ymm14, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

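// A hedged C sketch (not part of the build): the same 4-way horizontal
// reduction as inner_blend_t_8_lib4, followed by the axpby-style scaling:
//
// for(j=0; j<8; j++)
//     z[j] = alpha[0] * (z_part[j][0] + z_part[j][1] + z_part[j][2] + z_part[j][3])
//          + beta[0] * y[j];
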
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_AB_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_t_scale_ab_8_lib4, @function
inner_blend_t_scale_ab_8_lib4:
#elif defined(OS_MAC)
_inner_blend_t_scale_ab_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
inner_blend_t_scale_ab_8_lib4:
#endif
#endif

	// reduction
	vhaddpd %ymm1, %ymm0, %ymm0
	vhaddpd %ymm5, %ymm4, %ymm4
	vhaddpd %ymm3, %ymm2, %ymm2
	vhaddpd %ymm7, %ymm6, %ymm6
	vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
	vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
	vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
	vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
	vaddpd %ymm0, %ymm3, %ymm0
	vaddpd %ymm4, %ymm5, %ymm1

	// alpha
	vbroadcastsd 0(%r10), %ymm15
	vmulpd %ymm0, %ymm15, %ymm0
	vmulpd %ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd 0(%r11), %ymm15
	vmovupd 0(%r12), %ymm14
	vmulpd %ymm15, %ymm14, %ymm14
	vaddpd %ymm0, %ymm14, %ymm0
	vmovupd 32(%r12), %ymm14
	vmulpd %ymm15, %ymm14, %ymm14
	vaddpd %ymm1, %ymm14, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blender for ta==n
//
// input arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

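// A hedged C sketch of the alg dispatch (not part of the build):
//
// z = z_a + z_b;                     // reduction
// if(alg==1)      z = y + z;         // accumulate into y
// else if(alg!=0) z = y - z;         // subtract from y (alg==-1)
//                                    // alg==0: leave z as computed
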
#if MACRO_LEVEL>=1
	.macro INNER_BLENDER_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blender_n_8_lib4, @function
inner_blender_n_8_lib4:
#elif defined(OS_MAC)
_inner_blender_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
inner_blender_n_8_lib4:
#endif
#endif

	// reduction
	vaddpd %ymm0, %ymm2, %ymm0
	vaddpd %ymm1, %ymm3, %ymm1

	cmpl $0, %r10d // alg
	je 0f // return

	cmpl $1, %r10d // alg
	jne 1f // alg==-1

	// alg==1
	vmovupd 0(%r11), %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovupd 32(%r11), %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	jmp 0f // return

1:

	// alg==-1
	vmovupd 0(%r11), %ymm15
	vsubpd %ymm0, %ymm15, %ymm0
	vmovupd 32(%r11), %ymm15
	vsubpd %ymm1, %ymm15, %ymm1

0: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blender for ta==t
//
// input arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

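// Semantically this is inner_blend_t_8_lib4's horizontal reduction followed by
// inner_blender_n_8_lib4's alg==0/1/-1 update of y; see the sketches above.
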
#if MACRO_LEVEL>=1
	.macro INNER_BLENDER_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blender_t_8_lib4, @function
inner_blender_t_8_lib4:
#elif defined(OS_MAC)
_inner_blender_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
inner_blender_t_8_lib4:
#endif
#endif

	// reduction
	vhaddpd %ymm1, %ymm0, %ymm0
	vhaddpd %ymm5, %ymm4, %ymm4
	vhaddpd %ymm3, %ymm2, %ymm2
	vhaddpd %ymm7, %ymm6, %ymm6
	vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
	vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
	vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
	vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
	vaddpd %ymm0, %ymm3, %ymm0
	vaddpd %ymm4, %ymm5, %ymm1

	cmpl $0, %r10d // alg
	je 0f // return

	cmpl $1, %r10d // alg
	jne 1f // alg==-1

	// alg==1
	vmovupd 0(%r11), %ymm15
	vaddpd %ymm0, %ymm15, %ymm0
	vmovupd 32(%r11), %ymm15
	vaddpd %ymm1, %ymm15, %ymm1

	jmp 0f // return

1:

	// alg==-1
	vmovupd 0(%r11), %ymm15
	vsubpd %ymm0, %ymm15, %ymm0
	vmovupd 32(%r11), %ymm15
	vsubpd %ymm1, %ymm15, %ymm1

0: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// store
//
// input arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
//
// output arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8_lib4, @function
inner_store_8_lib4:
#elif defined(OS_MAC)
_inner_store_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8_lib4; .scl 2; .type 32; .endef
inner_store_8_lib4:
#endif
#endif

	vmovupd %ymm0, 0(%r10)
	vmovupd %ymm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_store_8_lib4, .-inner_store_8_lib4
#endif
#endif




//                            rdi    rsi            rdx        rcx      r8         r9             rsp+8      rsp+16
// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);

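// A hedged usage sketch in C (not part of this file; assumes A is already
// packed in the 4-row-panel "lib4" layout, with consecutive panels 4*sda
// doubles apart):
//
// extern void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda,
//                                   double *x, double *beta, double *y, double *z);
// double alpha = 1.0, beta = 1.0;
// kernel_dgemv_n_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
// // z[0:8] = beta*y[0:8] + alpha * A[0:8,0:k] * x[0:k]
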
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemv_n_8_lib4
	.type kernel_dgemv_n_8_lib4, @function
kernel_dgemv_n_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemv_n_8_lib4
_kernel_dgemv_n_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemv_n_8_lib4
	.def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
kernel_dgemv_n_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3


	// call inner dgemv kernel n

	movq ARG1, %r10 // k
	movq ARG3, %r11 // A
	movq ARG4, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
//	movslq %r12d, %r12
	movq ARG5, %r13 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_n_8_lib4
#endif
#endif


	// call inner blend n scale ab

	movq ARG2, %r10 // alpha
	movq ARG6, %r11 // beta
	movq ARG7, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_scale_ab_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_n_scale_ab_8_lib4
#endif
#endif



	// store

	movq ARG8, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
#endif




//                            rdi    rsi            rdx        rcx      r8         r9             rsp+8      rsp+16
// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);

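// A hedged usage sketch in C (not part of this file): with the same packed
// layout, this computes 8 entries of the transposed product:
//
// double alpha = 1.0, beta = 0.0;
// kernel_dgemv_t_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
// // z[0:8] = beta*y[0:8] + alpha * A[0:k,0:8]^T * x[0:k]
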
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemv_t_8_lib4
	.type kernel_dgemv_t_8_lib4, @function
kernel_dgemv_t_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemv_t_8_lib4
_kernel_dgemv_t_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemv_t_8_lib4
	.def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
kernel_dgemv_t_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7

	// call inner dgemv kernel t

	movq ARG1, %r10 // k
	movq ARG3, %r11 // A
	movq ARG4, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
//	movslq %r12d, %r12
	movq ARG5, %r13 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_t_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_t_8_lib4
#endif
#endif


	// call inner blend t scale ab

	movq ARG2, %r10 // alpha
	movq ARG6, %r11 // beta
	movq ARG7, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_t_scale_ab_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_t_scale_ab_8_lib4
#endif
#endif


	// store

	movq ARG8, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
#endif




//                             rdi    rsi        rdx      rcx        r8
// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);

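// A hedged usage sketch in C (not part of this file): computes the first 8
// entries of z = U*x for a k x k upper triangular, non-transposed U packed in
// the lib4 layout (k >= 8); the edge routine covers the triangular first 8
// columns, the generic dgemv_n inner kernel the remaining k-8:
//
// kernel_dtrmv_un_8_lib4(k, A, sda, x, z);
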
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrmv_un_8_lib4
	.type kernel_dtrmv_un_8_lib4, @function
kernel_dtrmv_un_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrmv_un_8_lib4
_kernel_dtrmv_un_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrmv_un_8_lib4
	.def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
kernel_dtrmv_un_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3


	// call inner dtrmv edge & dgemv kernel n

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG3, %r12
	sall $5, %r12d // 4*sda*sizeof(double)
//	movslq %r12d, %r12
	movq ARG4, %r13 // x


#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMV_UN_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrmv_un_8_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrmv_un_8_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_n_8_lib4
#endif
#endif


	// call inner blend n

#if MACRO_LEVEL>=1
	INNER_BLEND_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_n_8_lib4
#endif
#endif


	// store

	movq ARG5, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
#endif




	// read-only data
#if defined(OS_LINUX)
	.section .rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section __TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
LC00: // { -1 -1 -1 1 }
	.align 5
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad 1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
LC01: // { -1 -1 -1 -1 }
	.align 5
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad -1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
LC02: // { 3.5 2.5 1.5 0.5 }
	.align 5
#endif
	.long 0
	.long 1071644672
	.long 0
	.long 1073217536
	.long 0
	.long 1074003968
	.long 0
	.long 1074528256

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
LC03: // { 7.5 6.5 5.5 4.5 }
	.align 5
#endif
	.long 0
	.long 1074921472
	.long 0
	.long 1075183616
	.long 0
	.long 1075445760
	.long 0
	.long 1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
LC04: // { 1.0 1.0 1.0 1.0 }
	.align 5
#endif
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248



#if defined(OS_LINUX)
	.section .note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif