/***************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
***************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE + 8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx, (%rsp); \
	movq	%rbp, 8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	(%rsp), %rbx; \
	movq	8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

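// Note on the stack offsets above, assuming the System V AMD64 calling
// convention used on Linux and Mac: the first six integer arguments arrive
// in rdi, rsi, rdx, rcx, r8, r9, and argument 7 onwards live on the
// caller's stack starting 8 bytes above the return address. Once PROLOGUE
// has moved rsp down by STACKSIZE, argument n (n >= 7) is therefore found
// at STACKSIZE + 8*(n-6)(%rsp), which is exactly what ARG7..ARG18 encode.
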
#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx, (%rsp); \
	movq	%rbp, 8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	(%rsp), %rbx; \
	movq	8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

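// Note on the Windows variant, assuming the Microsoft x64 convention: only
// four arguments are passed in registers (rcx, rdx, r8, r9), and the caller
// always reserves a 32-byte shadow space below the return address, so
// argument 5 sits 8 + 32 = 40 bytes above the entry rsp, i.e. at
// STACKSIZE + 40(%rsp) after PROLOGUE. rdi, rsi and xmm6-xmm15 are
// callee-saved on Windows, which is why this PROLOGUE/EPILOGUE pair spills
// and restores them while the Linux/Mac pair does not.
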
#else

#error unsupported OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif

// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- 4*sda*sizeof(double)
// r13 <- x
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- 4*sda*sizeof(double)
// r13 <- x+k*sizeof(double)
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

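// For reference, a plain-C sketch of the accumulation this routine performs
// (hypothetical indexing helper, not part of the library): A is stored in
// the lib4 panel format, i.e. 4-row panels, column-major within each panel,
// with consecutive panels 4*sda doubles apart. A0 is the panel holding rows
// 0-3 and A1 = A0 + 4*sda the panel holding rows 4-7:
//
//	for(int j=0; j<k; j++)
//		for(int i=0; i<4; i++)
//			{
//			z[0+i] += A0[i+4*j] * x[j];
//			z[4+i] += A1[i+4*j] * x[j];
//			}
//
// The 4-way unrolled loop below spreads the partial sums over the four
// accumulator pairs (ymm0,ymm1)..(ymm6,ymm7); a blend routine adds them up.
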
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemv_add_n_8_lib4, @function
inner_kernel_dgemv_add_n_8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemv_add_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemv_add_n_8_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle	2f // return

	movq	%r11, %r15 // A1 <- A0
	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)

	cmpl	$4, %r10d
	jl	0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	vbroadcastsd	0(%r13), %ymm12
	vmovapd	0(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm0
	vmovapd	0(%r15), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm1

	subl	$4, %r10d

	vbroadcastsd	8(%r13), %ymm12
	vmovapd	32(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm2
	vmovapd	32(%r15), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm3

	vbroadcastsd	16(%r13), %ymm12
	vmovapd	64(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm4
	vmovapd	64(%r15), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm5

	vbroadcastsd	24(%r13), %ymm12
	addq	$32, %r13
	vmovapd	96(%r11), %ymm8
	addq	$128, %r11
	vfmadd231pd	%ymm8, %ymm12, %ymm6
	vmovapd	96(%r15), %ymm8
	addq	$128, %r15
	vfmadd231pd	%ymm8, %ymm12, %ymm7

	cmpl	$3, %r10d

	jg	1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle	2f // return

0: // clean-up

	vbroadcastsd	0(%r13), %ymm12
	vmovapd	0(%r11), %ymm8
	vmulpd	%ymm8, %ymm12, %ymm15
	vaddpd	%ymm0, %ymm15, %ymm0
	vmovapd	0(%r15), %ymm8
	vmulpd	%ymm8, %ymm12, %ymm15
	vaddpd	%ymm1, %ymm15, %ymm1

	addq	$32, %r11
	addq	$32, %r15
	addq	$8, %r13

	subl	$1, %r10d
	cmpl	$0, %r10d

	jg	0b // clean-up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- bs*sda*sizeof(double) = 32*sda
// r13 <- x
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- bs*sda*sizeof(double) = 32*sda
// r13 <- x+k*sizeof(double)
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

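// For reference, a plain-C sketch of the accumulation this routine performs
// (hypothetical indexing helper, not part of the library): x runs down the
// rows of A, and each z[i] collects the dot product of x with column i of
// the 8-column strip, where element (j,i) of a lib4-formatted A lives at
// double offset (j%4) + 4*i + 4*sda*(j/4):
//
//	for(int j=0; j<k; j++)
//		for(int i=0; i<8; i++)
//			z[i] += A[(j%4) + 4*i + 4*sda*(j/4)] * x[j];
//
// Each of ymm0..ymm7 holds four partial sums of one z[i]; the blend-t
// routines below reduce them horizontally.
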
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemv_add_t_8_lib4, @function
inner_kernel_dgemv_add_t_8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemv_add_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemv_add_t_8_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle	2f // return

	cmpl	$4, %r10d
	jl	0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	vmovupd	0(%r13), %ymm12

	vmovapd	0(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm0

	subl	$4, %r10d

	vmovapd	32(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm1

	vmovapd	64(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm2

	vmovapd	96(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm3

	vmovapd	128(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm4

	vmovapd	160(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm5

	vmovapd	192(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm6

	vmovapd	224(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm7

	addq	%r12, %r11
	addq	$32, %r13

	cmpl	$3, %r10d

	jg	1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle	2f // return

0: // clean-up

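	// Build a lane mask for the k%4 tail: broadcast the leftover count
	// r10d as a double to all four lanes, subtract it from
	// { 0.5, 1.5, 2.5, 3.5 } (.LC02), and lane i comes out negative
	// exactly when i < r10d. vmaskmovpd keys off the sign bit of each
	// 64-bit lane, so only the first r10d doubles of x are loaded and
	// the remaining lanes read as zero, contributing nothing.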
	vcvtsi2sd	%r10d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd	.LC02(%rip), %ymm13
#elif defined(OS_MAC)
	vmovupd	LC02(%rip), %ymm13
#endif
	vmovddup	%xmm14, %xmm14
	vinsertf128	$1, %xmm14, %ymm14, %ymm14
	vsubpd	%ymm14, %ymm13, %ymm14

	vmaskmovpd	0(%r13), %ymm14, %ymm12

	vmovapd	0(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm0

	vmovapd	32(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm1

	vmovapd	64(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm2

	vmovapd	96(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm3

	vmovapd	128(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm4

	vmovapd	160(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm5

	vmovapd	192(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm6

	vmovapd	224(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm7

	sall	$3, %r10d
//	movslq	%r10d, %r10
	addq	%r10, %r11
	addq	%r10, %r13
	xorl	%r10d, %r10d


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- 4*sda*sizeof(double)
// r13 <- x
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- k-8
// r11 <- A+8*4*sizeof(double)
// r12 <- 4*sda*sizeof(double)
// r13 <- x+8*sizeof(double)
// r15 <- dirty
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

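// The edge below multiplies by the 8x8 upper-triangular corner of A. The
// vblendpd immediates $0x1, $0x3 and $0x7 keep the top 1, 2 or 3 doubles of
// a loaded column and substitute zeros (ymm14) below the diagonal, so the
// triangle is enforced on the fly without keeping a cleared copy of A in
// memory.
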
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRMV_UN_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrmv_un_8_lib4, @function
inner_edge_dtrmv_un_8_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrmv_un_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
inner_edge_dtrmv_un_8_lib4:
#endif
#endif

	movq	%r11, %r15 // A1 <- A0
	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)

	vxorpd	%ymm14, %ymm14, %ymm14

	// first 4 columns
	vmovapd	0(%r11), %ymm8
	vblendpd	$0x1, %ymm8, %ymm14, %ymm8
	vbroadcastsd	0(%r13), %ymm12
	vfmadd231pd	%ymm8, %ymm12, %ymm0

	subl	$4, %r10d

	vmovapd	32(%r11), %ymm8
	vblendpd	$0x3, %ymm8, %ymm14, %ymm8
	vbroadcastsd	8(%r13), %ymm12
	vfmadd231pd	%ymm8, %ymm12, %ymm2

	vmovapd	64(%r11), %ymm8
	vblendpd	$0x7, %ymm8, %ymm14, %ymm8
	vbroadcastsd	16(%r13), %ymm12
	vfmadd231pd	%ymm8, %ymm12, %ymm4

	vmovapd	96(%r11), %ymm8
	vbroadcastsd	24(%r13), %ymm12
	vfmadd231pd	%ymm8, %ymm12, %ymm6

	addq	$128, %r11
	addq	$128, %r15
	addq	$32, %r13



	// last 4 columns
	vbroadcastsd	0(%r13), %ymm12
	vmovapd	0(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm0
	vmovapd	0(%r15), %ymm8
	vblendpd	$0x1, %ymm8, %ymm14, %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm1

	subl	$4, %r10d

	vbroadcastsd	8(%r13), %ymm12
	vmovapd	32(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm2
	vmovapd	32(%r15), %ymm8
	vblendpd	$0x3, %ymm8, %ymm14, %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm3

	vbroadcastsd	16(%r13), %ymm12
	vmovapd	64(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm4
	vmovapd	64(%r15), %ymm8
	vblendpd	$0x7, %ymm8, %ymm14, %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm5

	vbroadcastsd	24(%r13), %ymm12
	vmovapd	96(%r11), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm6
	vmovapd	96(%r15), %ymm8
	vfmadd231pd	%ymm8, %ymm12, %ymm7

	addq	$128, %r11
	addq	$128, %r15
	addq	$32, %r13


#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n
//
// input arguments:
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_n_8_lib4, @function
inner_blend_n_8_lib4:
#elif defined(OS_MAC)
_inner_blend_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
inner_blend_n_8_lib4:
#endif
#endif

	// reduction: sum all four accumulator pairs, as in the scale_ab variant
	vaddpd	%ymm0, %ymm2, %ymm0
	vaddpd	%ymm1, %ymm3, %ymm1
	vaddpd	%ymm4, %ymm6, %ymm4
	vaddpd	%ymm5, %ymm7, %ymm5
	vaddpd	%ymm0, %ymm4, %ymm0
	vaddpd	%ymm1, %ymm5, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t
//
// input arguments:
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

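// The reduction below is a 4x4 transpose-and-add: each vhaddpd sums lane
// pairs of two accumulators, the vperm2f128 pair gathers the matching low
// and high 128-bit halves, and the final vaddpd collapses the four partial
// sums of each z[i] into a single lane, leaving z0..z3 in ymm0 and z4..z7
// in ymm1.
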
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_t_8_lib4, @function
inner_blend_t_8_lib4:
#elif defined(OS_MAC)
_inner_blend_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
inner_blend_t_8_lib4:
#endif
#endif

	// reduction
	vhaddpd	%ymm1, %ymm0, %ymm0
	vhaddpd	%ymm5, %ymm4, %ymm4
	vhaddpd	%ymm3, %ymm2, %ymm2
	vhaddpd	%ymm7, %ymm6, %ymm6
	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
	vaddpd	%ymm0, %ymm3, %ymm0
	vaddpd	%ymm4, %ymm5, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

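// In BLAS terms, the routine below finishes z = alpha*A*x + beta*y for the
// ta==n case: the four unrolled accumulator pairs are summed, scaled by
// alpha, and beta*y is folded in with a fused multiply-add.
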
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_AB_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_n_scale_ab_8_lib4, @function
inner_blend_n_scale_ab_8_lib4:
#elif defined(OS_MAC)
_inner_blend_n_scale_ab_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
inner_blend_n_scale_ab_8_lib4:
#endif
#endif

	// reduction
	vaddpd	%ymm0, %ymm2, %ymm0
	vaddpd	%ymm1, %ymm3, %ymm1
	vaddpd	%ymm4, %ymm6, %ymm4
	vaddpd	%ymm5, %ymm7, %ymm5
	vaddpd	%ymm0, %ymm4, %ymm0
	vaddpd	%ymm1, %ymm5, %ymm1

	// alpha
	vbroadcastsd	0(%r10), %ymm15
	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd	0(%r11), %ymm15
	vmovupd	0(%r12), %ymm14
	vfmadd231pd	%ymm15, %ymm14, %ymm0
	vmovupd	32(%r12), %ymm14
	vfmadd231pd	%ymm15, %ymm14, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_AB_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blend_t_scale_ab_8_lib4, @function
inner_blend_t_scale_ab_8_lib4:
#elif defined(OS_MAC)
_inner_blend_t_scale_ab_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
inner_blend_t_scale_ab_8_lib4:
#endif
#endif

	// reduction
	vhaddpd	%ymm1, %ymm0, %ymm0
	vhaddpd	%ymm5, %ymm4, %ymm4
	vhaddpd	%ymm3, %ymm2, %ymm2
	vhaddpd	%ymm7, %ymm6, %ymm6
	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
	vaddpd	%ymm0, %ymm3, %ymm0
	vaddpd	%ymm4, %ymm5, %ymm1

	// alpha
	vbroadcastsd	0(%r10), %ymm15
	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd	0(%r11), %ymm15
	vmovupd	0(%r12), %ymm14
	vfmadd231pd	%ymm15, %ymm14, %ymm0
	vmovupd	32(%r12), %ymm14
	vfmadd231pd	%ymm15, %ymm14, %ymm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blender for ta==n
//
// input arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]_a
// ymm1 <- [z4 z5 z6 z7]_a
// ymm2 <- [z0 z1 z2 z3]_b
// ymm3 <- [z4 z5 z6 z7]_b
// ymm4 <- [z0 z1 z2 z3]_c
// ymm5 <- [z4 z5 z6 z7]_c
// ymm6 <- [z0 z1 z2 z3]_d
// ymm7 <- [z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLENDER_N_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blender_n_8_lib4, @function
inner_blender_n_8_lib4:
#elif defined(OS_MAC)
_inner_blender_n_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
inner_blender_n_8_lib4:
#endif
#endif

	// reduction
	vaddpd	%ymm0, %ymm2, %ymm0
	vaddpd	%ymm1, %ymm3, %ymm1
	vaddpd	%ymm4, %ymm6, %ymm4
	vaddpd	%ymm5, %ymm7, %ymm5
	vaddpd	%ymm0, %ymm4, %ymm0
	vaddpd	%ymm1, %ymm5, %ymm1

	cmpl	$0, %r10d // alg
	je	0f // return

	cmpl	$1, %r10d // alg
	jne	1f // alg==-1

	// alg==1
	vmovupd	0(%r11), %ymm15
	vaddpd	%ymm0, %ymm15, %ymm0
	vmovupd	32(%r11), %ymm15
	vaddpd	%ymm1, %ymm15, %ymm1

	jmp	0f // return

1:

	// alg==-1
	vmovupd	0(%r11), %ymm15
	vsubpd	%ymm0, %ymm15, %ymm0
	vmovupd	32(%r11), %ymm15
	vsubpd	%ymm1, %ymm15, %ymm1

0: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
#endif
#endif




// common inner routine with file scope
//
// blender for ta==t
//
// input arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- alg
// r11 <- y
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLENDER_T_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_blender_t_8_lib4, @function
inner_blender_t_8_lib4:
#elif defined(OS_MAC)
_inner_blender_t_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
inner_blender_t_8_lib4:
#endif
#endif

	// reduction
	vhaddpd	%ymm1, %ymm0, %ymm0
	vhaddpd	%ymm5, %ymm4, %ymm4
	vhaddpd	%ymm3, %ymm2, %ymm2
	vhaddpd	%ymm7, %ymm6, %ymm6
	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
	vaddpd	%ymm0, %ymm3, %ymm0
	vaddpd	%ymm4, %ymm5, %ymm1

	cmpl	$0, %r10d // alg
	je	0f // return

	cmpl	$1, %r10d // alg
	jne	1f // alg==-1

	// alg==1
	vmovupd	0(%r11), %ymm15
	vaddpd	%ymm0, %ymm15, %ymm0
	vmovupd	32(%r11), %ymm15
	vaddpd	%ymm1, %ymm15, %ymm1

	jmp	0f // return

1:

	// alg==-1
	vmovupd	0(%r11), %ymm15
	vsubpd	%ymm0, %ymm15, %ymm0
	vmovupd	32(%r11), %ymm15
	vsubpd	%ymm1, %ymm15, %ymm1

0: // return

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
#endif
#endif




// common inner routine with file scope
//
// store
//
// input arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]
//
// output arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3]
// ymm1 <- [z4 z5 z6 z7]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8_lib4, @function
inner_store_8_lib4:
#elif defined(OS_MAC)
_inner_store_8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8_lib4; .scl 2; .type 32; .endef
inner_store_8_lib4:
#endif
#endif

	vmovupd	%ymm0, 0(%r10)
	vmovupd	%ymm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8_lib4, .-inner_store_8_lib4
#endif
#endif




// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);

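// A C-side usage sketch (caller-side names are hypothetical): compute
// z[0:8] = alpha*A*x + beta*y for one 8-row strip of a lib4-formatted A,
// where sda is the panel stride in columns and A points to the top of an
// aligned 4-row panel; alpha and beta are passed by address because the
// kernel broadcasts them from memory:
//
//	double alpha = 1.0, beta = 0.0;
//	kernel_dgemv_n_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
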
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemv_n_8_lib4
	.type kernel_dgemv_n_8_lib4, @function
kernel_dgemv_n_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemv_n_8_lib4
_kernel_dgemv_n_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemv_n_8_lib4
	.def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
kernel_dgemv_n_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG5, %r13 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_n_8_lib4
#endif
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_scale_ab_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_n_scale_ab_8_lib4
#endif
#endif



	// store

	movq	ARG8, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
#endif




// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);

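// Usage mirrors the n variant (hypothetical sketch): here k counts the rows
// walked by x, and z receives alpha times the 8 dot products of x with 8
// consecutive columns of A, plus beta*y:
//
//	kernel_dgemv_t_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
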
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemv_t_8_lib4
	.type kernel_dgemv_t_8_lib4, @function
kernel_dgemv_t_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemv_t_8_lib4
_kernel_dgemv_t_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemv_t_8_lib4
	.def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
kernel_dgemv_t_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemv kernel t

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG5, %r13 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_t_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_t_8_lib4
#endif
#endif


	// call inner blend t scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_t_scale_ab_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_t_scale_ab_8_lib4
#endif
#endif


	// store

	movq	ARG8, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
#endif




// rdi rsi rdx rcx r8
// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);

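// A hedged reading of the semantics (argument names as in the prototype):
// for an upper-triangular, non-transposed A in lib4 format, this computes
// z[0:8] = A[0:8,0:k] * x[0:k], with the triangular edge routine masking
// the first 8 columns and the plain gemv-n kernel handling the remaining
// columns, so k is expected to be at least 8.
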
	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrmv_un_8_lib4
	.type kernel_dtrmv_un_8_lib4, @function
kernel_dtrmv_un_8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrmv_un_8_lib4
_kernel_dtrmv_un_8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrmv_un_8_lib4
	.def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
kernel_dtrmv_un_8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dtrmv edge & dgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG4, %r13 // x


#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMV_UN_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrmv_un_8_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrmv_un_8_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemv_add_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemv_add_n_8_lib4
#endif
#endif


	// call inner blend n

#if MACRO_LEVEL>=1
	INNER_BLEND_N_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_n_8_lib4
#elif defined(OS_MAC)
	callq _inner_blend_n_8_lib4
#endif
#endif


	// store

	movq	ARG5, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
#endif




	// read-only data
#if defined(OS_LINUX)
	.section	.rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section	__TEXT,__const
#elif defined(OS_WINDOWS)
	.section	.rdata,"dr"
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
	.align 5
LC00: // { -1 -1 -1 1 }
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
	.align 5
LC01: // { -1 -1 -1 -1 }
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	-1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC02: // { 3.5 2.5 1.5 0.5 }
#endif
	.long	0
	.long	1071644672
	.long	0
	.long	1073217536
	.long	0
	.long	1074003968
	.long	0
	.long	1074528256

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
	.align 5
LC03: // { 7.5 6.5 5.5 4.5 }
#endif
	.long	0
	.long	1074921472
	.long	0
	.long	1075183616
	.long	0
	.long	1075445760
	.long	0
	.long	1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC04: // { 1.0 1.0 1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

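// Each constant above encodes four IEEE-754 doubles as pairs of .long words,
// low word first; e.g. the pair { 0, 1071644672 } is 0x3FE0000000000000,
// i.e. 0.5, and { 0, 1072693248 } is 0x3FF0000000000000, i.e. 1.0. Of these
// tables only .LC02 is referenced in this file (by the masked tail of the
// dgemv t kernel); the others appear to be retained for symmetry with the
// related kernel files.
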


#if defined(OS_LINUX)
	.section	.note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
