/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1 %rdi
#define ARG2 %rsi
#define ARG3 %rdx
#define ARG4 %rcx
#define ARG5 %r8
#define ARG6 %r9
#define ARG7 STACKSIZE + 8(%rsp)
#define ARG8 STACKSIZE + 16(%rsp)
#define ARG9 STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define ARG19 STACKSIZE + 104(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx, (%rsp); \
	movq	%rbp, 8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	(%rsp), %rbx; \
	movq	8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1 %rcx
#define ARG2 %rdx
#define ARG3 %r8
#define ARG4 %r9
#define ARG5 STACKSIZE + 40(%rsp)
#define ARG6 STACKSIZE + 48(%rsp)
#define ARG7 STACKSIZE + 56(%rsp)
#define ARG8 STACKSIZE + 64(%rsp)
#define ARG9 STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define ARG19 STACKSIZE + 152(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx, (%rsp); \
	movq	%rbp, 8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	(%rsp), %rbx; \
	movq	8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#else

#error wrong OS

#endif
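
// Explanatory note (added comment; offsets are taken from the macros above):
// - System V (Linux/Mac): the first 6 integer arguments arrive in registers,
//   the rest on the stack above the return address, hence ARG7 onwards at
//   STACKSIZE+8(%rsp) once the prologue has moved %rsp down by STACKSIZE.
//   Only %rbx, %rbp and %r12-%r15 are callee-saved, so 64 bytes suffice.
// - Win64: only 4 register arguments, and a 32-byte shadow space plus the
//   return address precede the stack arguments, hence ARG5 at
//   STACKSIZE+40(%rsp). %rdi, %rsi and %xmm6-%xmm15 are callee-saved as
//   well, so the prologue also spills those and STACKSIZE grows to 256.
// vzeroupper avoids AVX-SSE transition penalties around the saved xmm state.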



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- 4*sda*sizeof(double)
// r13  <- B
// r14  <- 4*sdb*sizeof(double)
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- 4*sda*sizeof(double)
// r13  <- B+4*k*sizeof(double)
// r14  <- 4*sdb*sizeof(double)
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
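//
// Explanatory sketch (added comment, not in the original source); in C terms,
// with lib4 panel-major storage and nt meaning A times B transposed:
//
//	for(k=0; k<kmax; k++)
//		{
//		for(j=0; j<4; j++)
//			for(i=0; i<8; i++)
//				D[i][j] += A[i][k] * B[j][k]; // ymm0-ymm7
//		for(j=4; j<8; j++)
//			for(i=4; i<8; i++)
//				D[i][j] += A[i][k] * B[j][k]; // ymm8-ymm11
//		}
//
// A[0:3][k] and B[0:3][k] sit in the first panel (32 bytes per k), while
// A[4:7][k] and B[4:7][k] sit one panel row further on (r12 resp. r14 bytes).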

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nt_8x8_lib4, @function
inner_kernel_dgemm_add_nt_8x8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nt_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nt_8x8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nt_8x8_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle	2f // return

	// preload
	vmovapd	0(%r11), %ymm12
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13), %ymm14
	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	cmpl	$4, %r10d
	jle	0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	// unroll 0
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	subl	$4, %r10d
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	8(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	8(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	16(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	16(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	24(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	24(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	32(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	32(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	32(%r11, %r12, 1), %ymm13
	vbroadcastsd	32(%r13, %r14, 1), %ymm15

	// unroll 1
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	40(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	40(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	48(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	48(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	56(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	56(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	64(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	64(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	64(%r11, %r12, 1), %ymm13
	vbroadcastsd	64(%r13, %r14, 1), %ymm15

	// unroll 2
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	72(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	72(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	80(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	80(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	88(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	88(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	96(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	96(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	96(%r11, %r12, 1), %ymm13
	vbroadcastsd	96(%r13, %r14, 1), %ymm15

	// unroll 3
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	104(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	104(%r13, %r14, 1), %ymm15
	addq	$128, %r11

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	112(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	112(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	120(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	120(%r13, %r14, 1), %ymm15
	addq	$128, %r13

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	0(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	0(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	cmpl	$4, %r10d
	jg	1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle	4f // clean1

	// unroll 0
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	subl	$4, %r10d
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	8(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	8(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	16(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	16(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	24(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	24(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	32(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	32(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	32(%r11, %r12, 1), %ymm13
	vbroadcastsd	32(%r13, %r14, 1), %ymm15

	// unroll 1
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	40(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	40(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	48(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	48(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	56(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	56(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	64(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	64(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	64(%r11, %r12, 1), %ymm13
	vbroadcastsd	64(%r13, %r14, 1), %ymm15

	// unroll 2
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	72(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	72(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	80(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	80(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	88(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	88(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	96(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	96(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	96(%r11, %r12, 1), %ymm13
	vbroadcastsd	96(%r13, %r14, 1), %ymm15

	// unroll 3
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	104(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	104(%r13, %r14, 1), %ymm15
	addq	$128, %r11

	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	112(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	112(%r13, %r14, 1), %ymm15

	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	120(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	120(%r13, %r14, 1), %ymm15
	addq	$128, %r13

	vfmadd231pd	%ymm12, %ymm14, %ymm3
//	vmovapd	0(%r11), %ymm12
	vfmadd231pd	%ymm13, %ymm14, %ymm7
//	vbroadcastsd	0(%r13), %ymm14
	vfmadd231pd	%ymm13, %ymm15, %ymm11
//	vmovapd	0(%r11, %r12, 1), %ymm13
//	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	jmp	2f


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle	2f // return

	// clean-up loop
3: // clean up loop

	// unroll 0
	vmovapd	0(%r11), %ymm12
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13), %ymm14
	vfmadd231pd	%ymm12, %ymm14, %ymm0
	vfmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	0(%r13, %r14, 1), %ymm15
	vfmadd231pd	%ymm13, %ymm15, %ymm8
	subl	$1, %r10d

	vbroadcastsd	8(%r13), %ymm14
	vfmadd231pd	%ymm12, %ymm14, %ymm1
	vfmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	8(%r13, %r14, 1), %ymm15
	vfmadd231pd	%ymm13, %ymm15, %ymm9
	addq	$32, %r11

	vbroadcastsd	16(%r13), %ymm14
	vfmadd231pd	%ymm12, %ymm14, %ymm2
	vfmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	16(%r13, %r14, 1), %ymm15
	vfmadd231pd	%ymm13, %ymm15, %ymm10
	addq	$32, %r13

	vbroadcastsd	-8(%r13), %ymm14
	vfmadd231pd	%ymm12, %ymm14, %ymm3
	vfmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	-8(%r13, %r14, 1), %ymm15
	vfmadd231pd	%ymm13, %ymm15, %ymm11

	cmpl	$0, %r10d
	jg	3b // clean up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nt_8x8_lib4, .-inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- 4*sda*sizeof(double)
// r13  <- B
// r14  <- 4*sdb*sizeof(double)
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- 4*sda*sizeof(double)
// r13  <- B+4*k*sizeof(double)
// r14  <- 4*sdb*sizeof(double)
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
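//
// Note (added comment): the loop structure below is identical to
// inner_kernel_dgemm_add_nt_8x8_lib4 above; the only difference is that
// vfnmadd231pd replaces vfmadd231pd, so the accumulators are updated as
// D -= A * B^T instead of D += A * B^T.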

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_sub_nt_8x8_lib4, @function
inner_kernel_dgemm_sub_nt_8x8_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_sub_nt_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_sub_nt_8x8_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_sub_nt_8x8_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle	2f // return

	// preload
	vmovapd	0(%r11), %ymm12
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13), %ymm14
	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	cmpl	$4, %r10d
	jle	0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	// unroll 0
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	subl	$4, %r10d
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	8(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	8(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	16(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	16(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	24(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	24(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	32(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	32(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	32(%r11, %r12, 1), %ymm13
	vbroadcastsd	32(%r13, %r14, 1), %ymm15

	// unroll 1
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	40(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	40(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	48(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	48(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	56(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	56(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	64(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	64(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	64(%r11, %r12, 1), %ymm13
	vbroadcastsd	64(%r13, %r14, 1), %ymm15

	// unroll 2
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	72(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	72(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	80(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	80(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	88(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	88(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	96(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	96(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	96(%r11, %r12, 1), %ymm13
	vbroadcastsd	96(%r13, %r14, 1), %ymm15

	// unroll 3
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	104(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	104(%r13, %r14, 1), %ymm15
	addq	$128, %r11

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	112(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	112(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	120(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	120(%r13, %r14, 1), %ymm15
	addq	$128, %r13

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	0(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	0(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	cmpl	$4, %r10d
	jg	1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle	4f // clean1

	// unroll 0
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	subl	$4, %r10d
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	8(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	8(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	16(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	16(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	24(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	24(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	32(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	32(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	32(%r11, %r12, 1), %ymm13
	vbroadcastsd	32(%r13, %r14, 1), %ymm15

	// unroll 1
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	40(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	40(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	48(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	48(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	56(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	56(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	64(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	64(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	64(%r11, %r12, 1), %ymm13
	vbroadcastsd	64(%r13, %r14, 1), %ymm15

	// unroll 2
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	72(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	72(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	80(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	80(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	88(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	88(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vmovapd	96(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	96(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
	vmovapd	96(%r11, %r12, 1), %ymm13
	vbroadcastsd	96(%r13, %r14, 1), %ymm15

	// unroll 3
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	104(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	vbroadcastsd	104(%r13, %r14, 1), %ymm15
	addq	$128, %r11

	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	112(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	vbroadcastsd	112(%r13, %r14, 1), %ymm15

	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	120(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	vbroadcastsd	120(%r13, %r14, 1), %ymm15
	addq	$128, %r13

	vfnmadd231pd	%ymm12, %ymm14, %ymm3
//	vmovapd	0(%r11), %ymm12
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
//	vbroadcastsd	0(%r13), %ymm14
	vfnmadd231pd	%ymm13, %ymm15, %ymm11
//	vmovapd	0(%r11, %r12, 1), %ymm13
//	vbroadcastsd	0(%r13, %r14, 1), %ymm15

	jmp	2f


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle	2f // return

	// clean-up loop
3: // clean up loop

	// unroll 0
	vmovapd	0(%r11), %ymm12
	vmovapd	0(%r11, %r12, 1), %ymm13
	vbroadcastsd	0(%r13), %ymm14
	vfnmadd231pd	%ymm12, %ymm14, %ymm0
	vfnmadd231pd	%ymm13, %ymm14, %ymm4
	vbroadcastsd	0(%r13, %r14, 1), %ymm15
	vfnmadd231pd	%ymm13, %ymm15, %ymm8
	subl	$1, %r10d

	vbroadcastsd	8(%r13), %ymm14
	vfnmadd231pd	%ymm12, %ymm14, %ymm1
	vfnmadd231pd	%ymm13, %ymm14, %ymm5
	vbroadcastsd	8(%r13, %r14, 1), %ymm15
	vfnmadd231pd	%ymm13, %ymm15, %ymm9
	addq	$32, %r11

	vbroadcastsd	16(%r13), %ymm14
	vfnmadd231pd	%ymm12, %ymm14, %ymm2
	vfnmadd231pd	%ymm13, %ymm14, %ymm6
	vbroadcastsd	16(%r13, %r14, 1), %ymm15
	vfnmadd231pd	%ymm13, %ymm15, %ymm10
	addq	$32, %r13

	vbroadcastsd	-8(%r13), %ymm14
	vfnmadd231pd	%ymm12, %ymm14, %ymm3
	vfnmadd231pd	%ymm13, %ymm14, %ymm7
	vbroadcastsd	-8(%r13, %r14, 1), %ymm15
	vfnmadd231pd	%ymm13, %ymm15, %ymm11

	cmpl	$0, %r10d
	jg	3b // clean up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_sub_nt_8x8_lib4, .-inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for generic alpha and beta
//
// input arguments:
// r10  <- &alpha
// r11  <- &beta
// r12  <- C
// r13  <- 4*sdc*sizeof(double)
// r14  <- dirty
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- &alpha
// r11  <- &beta
// r12  <- C
// r13  <- 4*sdc*sizeof(double)
// r14  <- dirty
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
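//
// Explanatory sketch (added comment): in C terms, D = alpha*D + beta*C,
// applied to the three 4x4 blocks held in ymm0-ymm11. When beta==0.0 the
// branch below skips the C loads entirely, so C is allowed to point at
// uninitialized memory in that case.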

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_8x8_lib4, @function
inner_scale_ab_8x8_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
inner_scale_ab_8x8_lib4:
#endif
#endif


	vbroadcastsd	0(%r10), %ymm15 // alpha

	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1
	vmulpd	%ymm2, %ymm15, %ymm2
	vmulpd	%ymm3, %ymm15, %ymm3

	vmulpd	%ymm4, %ymm15, %ymm4
	vmulpd	%ymm5, %ymm15, %ymm5
	vmulpd	%ymm6, %ymm15, %ymm6
	vmulpd	%ymm7, %ymm15, %ymm7

	vmulpd	%ymm8, %ymm15, %ymm8
	vmulpd	%ymm9, %ymm15, %ymm9
	vmulpd	%ymm10, %ymm15, %ymm10
	vmulpd	%ymm11, %ymm15, %ymm11

	vbroadcastsd	0(%r11), %ymm14 // beta

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je	0f // end

	vmovapd	0(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm0
	vmovapd	32(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm1
	vmovapd	64(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm2
	vmovapd	96(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm3

	vmovapd	0(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm4
	vmovapd	32(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm5
	vmovapd	64(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm6
	vmovapd	96(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm7

	vmovapd	128(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm8
	vmovapd	160(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm9
	vmovapd	192(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm10
	vmovapd	224(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// transpose and scale for generic alpha and beta
//
// input arguments:
// r10  <- &alpha
// r11  <- &beta
// r12  <- C
// r13  <- 4*sdc*sizeof(double)
// r14  <- dirty
// r15  <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- &alpha
// r11  <- &beta
// r12  <- C
// r13  <- 4*sdc*sizeof(double)
// r14  <- dirty
// r15  <- dirty
// ymm0  <- [d00 d01 d02 d03]
// ymm1  <- [d10 d11 d12 d13]
// ymm2  <- [d20 d21 d22 d23]
// ymm3  <- [d30 d31 d32 d33]
// ymm4  <- [d40 d41 d42 d43]
// ymm5  <- [d50 d51 d52 d53]
// ymm6  <- [d60 d61 d62 d63]
// ymm7  <- [d70 d71 d72 d73]
// ymm8  <- [d44 d45 d46 d47]
// ymm9  <- [d54 d55 d56 d57]
// ymm10 <- [d64 d65 d66 d67]
// ymm11 <- [d74 d75 d76 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_TRAN_SCALE_AB_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_tran_scale_ab_8x8_lib4, @function
inner_tran_scale_ab_8x8_lib4:
#elif defined(OS_MAC)
_inner_tran_scale_ab_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_tran_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
inner_tran_scale_ab_8x8_lib4:
#endif
#endif

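	// Explanatory note (added comment): each 4x4 block is transposed in
	// registers before scaling. vunpcklpd/vunpckhpd interleave pairs of
	// columns within the 128-bit lanes, then vperm2f128 exchanges the
	// lanes, turning the four column vectors into four row vectors.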
	vunpcklpd	%ymm1, %ymm0, %ymm12
	vunpckhpd	%ymm1, %ymm0, %ymm13
	vunpcklpd	%ymm3, %ymm2, %ymm14
	vunpckhpd	%ymm3, %ymm2, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3

	vbroadcastsd	0(%r10), %ymm15 // alpha

	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1
	vmulpd	%ymm2, %ymm15, %ymm2
	vmulpd	%ymm3, %ymm15, %ymm3

	vunpcklpd	%ymm5, %ymm4, %ymm12
	vunpckhpd	%ymm5, %ymm4, %ymm13
	vunpcklpd	%ymm7, %ymm6, %ymm14
	vunpckhpd	%ymm7, %ymm6, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7

	vbroadcastsd	0(%r10), %ymm15 // alpha

	vmulpd	%ymm4, %ymm15, %ymm4
	vmulpd	%ymm5, %ymm15, %ymm5
	vmulpd	%ymm6, %ymm15, %ymm6
	vmulpd	%ymm7, %ymm15, %ymm7

	vunpcklpd	%ymm9, %ymm8, %ymm12
	vunpckhpd	%ymm9, %ymm8, %ymm13
	vunpcklpd	%ymm11, %ymm10, %ymm14
	vunpckhpd	%ymm11, %ymm10, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11

	vbroadcastsd	0(%r10), %ymm15 // alpha

	vmulpd	%ymm8, %ymm15, %ymm8
	vmulpd	%ymm9, %ymm15, %ymm9
	vmulpd	%ymm10, %ymm15, %ymm10
	vmulpd	%ymm11, %ymm15, %ymm11

	vbroadcastsd	0(%r11), %ymm14 // beta

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je	0f // end

	vmovapd	0(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm0
	vmovapd	32(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm1
	vmovapd	64(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm2
	vmovapd	96(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm3

	vmovapd	128(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm4
	vmovapd	160(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm5
	vmovapd	192(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm6
	vmovapd	224(%r12), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm7

	vmovapd	128(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm8
	vmovapd	160(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm9
	vmovapd	192(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm10
	vmovapd	224(%r12, %r13, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_tran_scale_ab_8x8_lib4, .-inner_tran_scale_ab_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for alpha=1.0 and beta=1.0
//
// input arguments:
// r10  <- C
// r11  <- 4*sdc*sizeof(double)
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- C
// r11  <- 4*sdc*sizeof(double)
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_11_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_11_8x8_lib4, @function
inner_scale_11_8x8_lib4:
#elif defined(OS_MAC)
_inner_scale_11_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_11_8x8_lib4; .scl 2; .type 32; .endef
inner_scale_11_8x8_lib4:
#endif
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovapd	.LC04(%rip), %ymm14 // beta=1.0
#else
	vmovapd	LC04(%rip), %ymm14 // beta=1.0
#endif
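	// Note (added comment, assumption): LC04 is taken to be the usual
	// BLASFEO constant block of four packed 1.0 doubles, defined towards
	// the end of this file (outside this excerpt). Broadcasting beta=1.0
	// this way turns the update below into a plain D += C.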

	vmovapd	0(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm0
	vmovapd	32(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm1
	vmovapd	64(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm2
	vmovapd	96(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm3

	vmovapd	0(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm4
	vmovapd	32(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm5
	vmovapd	64(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm6
	vmovapd	96(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm7

	vmovapd	128(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm8
	vmovapd	160(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm9
	vmovapd	192(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm10
	vmovapd	224(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_11_8x8_lib4, .-inner_scale_11_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// transpose and scale for alpha=1.0 and beta=1.0
//
// input arguments:
// r10  <- C
// r11  <- 4*sdc*sizeof(double)
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- C
// r11  <- 4*sdc*sizeof(double)
// ymm0  <- [d00 d01 d02 d03]
// ymm1  <- [d10 d11 d12 d13]
// ymm2  <- [d20 d21 d22 d23]
// ymm3  <- [d30 d31 d32 d33]
// ymm4  <- [d40 d41 d42 d43]
// ymm5  <- [d50 d51 d52 d53]
// ymm6  <- [d60 d61 d62 d63]
// ymm7  <- [d70 d71 d72 d73]
// ymm8  <- [d44 d45 d46 d47]
// ymm9  <- [d54 d55 d56 d57]
// ymm10 <- [d64 d65 d66 d67]
// ymm11 <- [d74 d75 d76 d77]
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_TRAN_SCALE_11_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_tran_scale_11_8x8_lib4, @function
inner_tran_scale_11_8x8_lib4:
#elif defined(OS_MAC)
_inner_tran_scale_11_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_tran_scale_11_8x8_lib4; .scl 2; .type 32; .endef
inner_tran_scale_11_8x8_lib4:
#endif
#endif


	vunpcklpd	%ymm1, %ymm0, %ymm12
	vunpckhpd	%ymm1, %ymm0, %ymm13
	vunpcklpd	%ymm3, %ymm2, %ymm14
	vunpckhpd	%ymm3, %ymm2, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3

	vunpcklpd	%ymm5, %ymm4, %ymm12
	vunpckhpd	%ymm5, %ymm4, %ymm13
	vunpcklpd	%ymm7, %ymm6, %ymm14
	vunpckhpd	%ymm7, %ymm6, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7

	vunpcklpd	%ymm9, %ymm8, %ymm12
	vunpckhpd	%ymm9, %ymm8, %ymm13
	vunpcklpd	%ymm11, %ymm10, %ymm14
	vunpckhpd	%ymm11, %ymm10, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovapd	.LC04(%rip), %ymm14 // beta=1.0
#else
	vmovapd	LC04(%rip), %ymm14 // beta=1.0
#endif

	vmovapd	0(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm0
	vmovapd	32(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm1
	vmovapd	64(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm2
	vmovapd	96(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm3

	vmovapd	128(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm4
	vmovapd	160(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm5
	vmovapd	192(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm6
	vmovapd	224(%r10), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm7

	vmovapd	128(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm8
	vmovapd	160(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm9
	vmovapd	192(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm10
	vmovapd	224(%r10, %r11, 1), %ymm15
	vfmadd231pd	%ymm14, %ymm15, %ymm11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_tran_scale_11_8x8_lib4, .-inner_tran_scale_11_8x8_lib4
#endif
#endif




// common inner routine with file scope
//
// Cholesky factorization
//
// input arguments:
// r10  <- inv_diag_E
// r11d <- kn
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- inv_diag_E
// r11d <- kn
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
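//
// Explanatory sketch (added comment; it assumes the straight column layout
// produced by the kernels above, with the 8x8 lower part held in ymm0-ymm11):
// for each column j of the block,
//
//	if(D[j][j] > 0.0) inv = 1.0/sqrt(D[j][j]); else inv = 0.0; // labels 1-16
//	inv_diag_E[j] = inv;
//	D[:][j] *= inv;                      // scale column j
//	D[:][i] -= D[i][j] * D[:][j], i>j;   // eliminate it from later columns
//
// kn (r11d) cuts the elimination short for the last columns when the actual
// block is narrower than 8 (the _vs, variable-size case).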

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DPOTRF_8X8_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dpotrf_8x8_vs_lib4, @function
inner_edge_dpotrf_8x8_vs_lib4:
#elif defined(OS_MAC)
_inner_edge_dpotrf_8x8_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dpotrf_8x8_vs_lib4; .scl 2; .type 32; .endef
inner_edge_dpotrf_8x8_vs_lib4:
#endif
#endif

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovsd	.LC04(%rip), %xmm14 // 1.0
#elif defined(OS_MAC)
	vmovsd	LC04(%rip), %xmm14 // 1.0
#endif

	vmovsd	%xmm0, %xmm0, %xmm13
	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
	jbe	1f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
2:
	vmovsd	%xmm13, 0(%r10)
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm0, %ymm13, %ymm0
	vmulpd	%ymm4, %ymm13, %ymm4
	vpermpd	$0x55, %ymm0, %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm1
	vfnmadd231pd	%ymm4, %ymm13, %ymm5
	vperm2f128	$0x11, %ymm0, %ymm0, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm2
	vfnmadd231pd	%ymm4, %ymm13, %ymm6
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm3
	vfnmadd231pd	%ymm4, %ymm13, %ymm7
	vperm2f128	$0x00, %ymm4, %ymm4, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm8
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm9
	vperm2f128	$0x11, %ymm4, %ymm4, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm11

	vpermilpd	$0x3, %xmm1, %xmm13
	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
	jbe	3f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
4:
	vmovsd	%xmm13, 8(%r10)
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm1, %ymm13, %ymm1
	vmulpd	%ymm5, %ymm13, %ymm5
	vperm2f128	$0x11, %ymm1, %ymm1, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm2
	vfnmadd231pd	%ymm5, %ymm13, %ymm6
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm3
	vfnmadd231pd	%ymm5, %ymm13, %ymm7
	vperm2f128	$0x00, %ymm5, %ymm5, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm8
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm9
	vperm2f128	$0x11, %ymm5, %ymm5, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm11

	vextractf128	$0x1, %ymm2, %xmm13
	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
	jbe	5f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
6:
	vmovsd	%xmm13, 16(%r10)
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm2, %ymm13, %ymm2
	vmulpd	%ymm6, %ymm13, %ymm6
	vpermpd	$0xff, %ymm2, %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm3
	vfnmadd231pd	%ymm6, %ymm13, %ymm7
	vperm2f128	$0x00, %ymm6, %ymm6, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm8
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm9
	vperm2f128	$0x11, %ymm6, %ymm6, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm11

	vpermpd	$0xff, %ymm3, %ymm13
	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
	jbe	7f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
8:
	vmovsd	%xmm13, 24(%r10)
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm3, %ymm13, %ymm3
	vmulpd	%ymm7, %ymm13, %ymm7
	vperm2f128	$0x00, %ymm7, %ymm7, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm8
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm9
	vperm2f128	$0x11, %ymm7, %ymm7, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm11

	vmovsd	%xmm8, %xmm8, %xmm13
	vucomisd	%xmm15, %xmm13 // d_44 > 0.0 ?
	jbe	9f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
10:
	vmovsd	%xmm13, 32(%r10)
//	vmovddup	%xmm13, %xmm13
//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm8, %ymm13, %ymm8
	cmpl	$6, %r11d
	jl	0f // ret
//	vperm2f128	$0x00, %ymm8, %ymm8, %ymm12
//	vpermilpd	$0xf, %ymm12, %ymm13
	vpermpd	$0x55, %ymm8, %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm9
	vperm2f128	$0x11, %ymm8, %ymm8, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm11

	vpermilpd	$0x3, %xmm9, %xmm13
	vucomisd	%xmm15, %xmm13 // d_55 > 0.0 ?
	jbe	11f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
12:
	vmovsd	%xmm13, 40(%r10)
//	vmovddup	%xmm13, %xmm13
//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm9, %ymm13, %ymm9
	cmpl	$7, %r11d
	jl	0f // ret
	vperm2f128	$0x11, %ymm9, %ymm9, %ymm12
	vpermilpd	$0x0, %ymm12, %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm10
	vpermilpd	$0xf, %ymm12, %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm11

	vextractf128	$0x1, %ymm10, %xmm13
	vucomisd	%xmm15, %xmm13 // d_66 > 0.0 ?
	jbe	13f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
14:
	vmovsd	%xmm13, 48(%r10)
//	vmovddup	%xmm13, %xmm13
//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm10, %ymm13, %ymm10
	cmpl	$8, %r11d
	jl	0f // ret
//	vperm2f128	$0x11, %ymm10, %ymm10, %ymm12
//	vpermilpd	$0xf, %ymm12, %ymm13
	vpermpd	$0xff, %ymm10, %ymm13
	vfnmadd231pd	%ymm10, %ymm13, %ymm11

//	vextractf128	$0x1, %ymm11, %xmm13
//	vpermilpd	$0x3, %xmm13, %xmm13
	vpermpd	$0xff, %ymm11, %ymm13
	vucomisd	%xmm15, %xmm13 // d_77 > 0.0 ?
	jbe	15f
	vsqrtsd	%xmm13, %xmm13, %xmm13
	vdivsd	%xmm13, %xmm14, %xmm13
16:
	vmovsd	%xmm13, 56(%r10)
//	vmovddup	%xmm13, %xmm13
//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
	vpermpd	$0x00, %ymm13, %ymm13
	vmulpd	%ymm11, %ymm13, %ymm11

	jmp	0f
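
	// Note (added comment): non-positive pivots are handled out of line.
	// Each branch below zeroes the scaling factor, so the whole column is
	// annihilated, then jumps straight back into the factorization; this
	// keeps the hot path above free of the extra code.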
1:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	2b

3:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	4b

5:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	6b

7:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	8b

9:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	10b

11:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	12b

13:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	14b

15:
	vxorpd	%ymm13, %ymm13, %ymm13
	jmp	16b

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dpotrf_8x8_vs_lib4, .-inner_edge_dpotrf_8x8_vs_lib4
#endif
#endif




// common inner routine with file scope
//
// triangular substitution for Cholesky factorization
//
// input arguments:
// r10  <- E
// r11  <- 4*sde*sizeof(double)
// r12  <- inv_diag_E
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10  <- E+4*4*sizeof(double)
// r11  <- 4*sde*sizeof(double)
// r12  <- inv_diag_E
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d40 d50 d60 d70]
// ymm5  <- [d41 d51 d61 d71]
// ymm6  <- [d42 d52 d62 d72]
// ymm7  <- [d43 d53 d63 d73]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
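//
// Explanatory sketch (added comment): E is the lower-triangular factor, its
// strictly-lower entries read column by column, and inv_diag_E holds its
// inverted diagonal. For each of the 8 right-hand-side columns j,
//
//	D[:][j] *= inv_diag_E[j];
//	D[:][i] -= E[i][j] * D[:][j], i>j;
//
// ymm0-ymm3/ymm4-ymm7 hold rows 0-3/4-7 of columns 0-3, ymm8-ymm11 rows 4-7
// of columns 4-7; rows 0-3 of columns 4-7 are not kept (8x8 "lower" variant).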

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrsm_rlt_inv_8x8l_lib4, @function
inner_edge_dtrsm_rlt_inv_8x8l_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrsm_rlt_inv_8x8l_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrsm_rlt_inv_8x8l_lib4; .scl 2; .type 32; .endef
inner_edge_dtrsm_rlt_inv_8x8l_lib4:
#endif
#endif

	vbroadcastsd	0(%r12), %ymm13
	vmulpd	%ymm0, %ymm13, %ymm0
	vmulpd	%ymm4, %ymm13, %ymm4
	vbroadcastsd	8(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm1
	vfnmadd231pd	%ymm4, %ymm13, %ymm5
	vbroadcastsd	16(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm2
	vfnmadd231pd	%ymm4, %ymm13, %ymm6
	vbroadcastsd	24(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm3
	vfnmadd231pd	%ymm4, %ymm13, %ymm7
	vbroadcastsd	0(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm11

	vbroadcastsd	8(%r12), %ymm13
	vmulpd	%ymm1, %ymm13, %ymm1
	vmulpd	%ymm5, %ymm13, %ymm5
	vbroadcastsd	48(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm2
	vfnmadd231pd	%ymm5, %ymm13, %ymm6
	vbroadcastsd	56(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm3
	vfnmadd231pd	%ymm5, %ymm13, %ymm7
	vbroadcastsd	32(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm8
	vbroadcastsd	40(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm11

	vbroadcastsd	16(%r12), %ymm13
	vmulpd	%ymm2, %ymm13, %ymm2
	vmulpd	%ymm6, %ymm13, %ymm6
	vbroadcastsd	88(%r10), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm3
	vfnmadd231pd	%ymm6, %ymm13, %ymm7
	vbroadcastsd	64(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm8
	vbroadcastsd	72(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm9
	vbroadcastsd	80(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm11

	vbroadcastsd	24(%r12), %ymm13
	vmulpd	%ymm3, %ymm13, %ymm3
	vmulpd	%ymm7, %ymm13, %ymm7
	vbroadcastsd	96(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm8
	vbroadcastsd	104(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm9
	vbroadcastsd	112(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm10
	vbroadcastsd	120(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm11
	addq	$128, %r10

	vbroadcastsd	32(%r12), %ymm13
	vmulpd	%ymm8, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm11

	vbroadcastsd	40(%r12), %ymm13
	vmulpd	%ymm9, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm11

	vbroadcastsd	48(%r12), %ymm13
	vmulpd	%ymm10, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm10, %ymm13, %ymm11

	vbroadcastsd	56(%r12), %ymm13
	vmulpd	%ymm11, %ymm13, %ymm11

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dtrsm_rlt_inv_8x8l_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_lib4
#endif
#endif




// common inner routine with file scope
//
// triangular substitution for Cholesky factorization
//
// input arguments:
// r10  <- E
// r11  <- 4*sde*sizeof(double)
// r12  <- inv_diag_E
// r13  <- D
// r14  <- 4*sdd*sizeof(double)
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d04 d14 d24 d34]
// ymm5  <- [d05 d15 d25 d35]
// ymm6  <- [d06 d16 d26 d36]
// ymm7  <- [d07 d17 d27 d37]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10  <- E+4*4*sizeof(double)
// r11  <- 4*sde*sizeof(double)
// r12  <- inv_diag_E
// r13  <- D
// r14  <- 4*sdd*sizeof(double)
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
// ymm2  <- [d02 d12 d22 d32]
// ymm3  <- [d03 d13 d23 d33]
// ymm4  <- [d04 d14 d24 d34]
// ymm5  <- [d05 d15 d25 d35]
// ymm6  <- [d06 d16 d26 d36]
// ymm7  <- [d07 d17 d27 d37]
// ymm8  <- [d44 d54 d64 d74]
// ymm9  <- [d45 d55 d65 d75]
// ymm10 <- [d46 d56 d66 d76]
// ymm11 <- [d47 d57 d67 d77]
// ymm12 <- dirty
// ymm13 <- dirty
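//
// Note (added comment): this is the same right-lower-transposed solve as the
// 8x8l variant above, but here the top 4x8 strip of the result sits in
// ymm0-ymm7 and the bottom-right 4x4 block in ymm8-ymm11, while the already
// solved bottom-left 4x4 block is streamed from D in memory (r13/r14).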

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrsm_rlt_inv_8x8u_lib4, @function
inner_edge_dtrsm_rlt_inv_8x8u_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrsm_rlt_inv_8x8u_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrsm_rlt_inv_8x8u_lib4; .scl 2; .type 32; .endef
inner_edge_dtrsm_rlt_inv_8x8u_lib4:
#endif
#endif

	vbroadcastsd	0(%r12), %ymm13
	vmulpd	%ymm0, %ymm13, %ymm0
	vbroadcastsd	8(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm1
	vbroadcastsd	16(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm2
	vbroadcastsd	24(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm3

	vmovapd	0(%r13, %r14, 1), %ymm12
	vbroadcastsd	0(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	8(%r12), %ymm13
	vmulpd	%ymm1, %ymm13, %ymm1
	vbroadcastsd	48(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm2
	vbroadcastsd	56(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm3

	vmovapd	32(%r13, %r14, 1), %ymm12
	vbroadcastsd	32(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	40(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	16(%r12), %ymm13
	vmulpd	%ymm2, %ymm13, %ymm2
	vbroadcastsd	88(%r10), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm3

	vmovapd	64(%r13, %r14, 1), %ymm12
	vbroadcastsd	64(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	72(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	80(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	24(%r12), %ymm13
	vmulpd	%ymm3, %ymm13, %ymm3

	vmovapd	96(%r13, %r14, 1), %ymm12
	vbroadcastsd	96(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	104(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	112(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	120(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11

	addq	$128, %r10

	vbroadcastsd	32(%r12), %ymm13
	vmulpd	%ymm4, %ymm13, %ymm4
	vmulpd	%ymm8, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm5
	vfnmadd231pd	%ymm8, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm6
	vfnmadd231pd	%ymm8, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm7
	vfnmadd231pd	%ymm8, %ymm13, %ymm11

	vbroadcastsd	40(%r12), %ymm13
	vmulpd	%ymm5, %ymm13, %ymm5
	vmulpd	%ymm9, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm6
	vfnmadd231pd	%ymm9, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm7
	vfnmadd231pd	%ymm9, %ymm13, %ymm11

	vbroadcastsd	48(%r12), %ymm13
	vmulpd	%ymm6, %ymm13, %ymm6
	vmulpd	%ymm10, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm7
	vfnmadd231pd	%ymm10, %ymm13, %ymm11

	vbroadcastsd	56(%r12), %ymm13
	vmulpd	%ymm7, %ymm13, %ymm7
	vmulpd	%ymm11, %ymm13, %ymm11

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dtrsm_rlt_inv_8x8u_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_lib4
#endif
#endif




2063// common inner routine with file scope
2064//
2065// triangular substitution for cholesky factorization
2066//
// input arguments:
// r10 <- E
// r11 <- sde
// r12 <- inv_diag_E
// r13d <- kn
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
// ymm4 <- [d40 d51 d62 d73]
// ymm5 <- [d41 d50 d63 d72]
// ymm6 <- [d43 d52 d61 d70]
// ymm7 <- [d42 d53 d60 d71]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- sde
// r12 <- inv_diag_E
// r13d <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d51 d62 d73]
// ymm5 <- [d41 d50 d63 d72]
// ymm6 <- [d43 d52 d61 d70]
// ymm7 <- [d42 d53 d60 d71]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, @function
inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
#endif
#endif

	vbroadcastsd	0(%r12), %ymm13
	vmulpd			%ymm0, %ymm13, %ymm0
	vmulpd			%ymm4, %ymm13, %ymm4
	vbroadcastsd	8(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm1
	vfnmadd231pd	%ymm4, %ymm13, %ymm5
	vbroadcastsd	16(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm2
	vfnmadd231pd	%ymm4, %ymm13, %ymm6
	vbroadcastsd	24(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm3
	vfnmadd231pd	%ymm4, %ymm13, %ymm7
	vbroadcastsd	0(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm11

	vbroadcastsd	8(%r12), %ymm13
	vmulpd			%ymm1, %ymm13, %ymm1
	vmulpd			%ymm5, %ymm13, %ymm5
	vbroadcastsd	48(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm2
	vfnmadd231pd	%ymm5, %ymm13, %ymm6
	vbroadcastsd	56(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm3
	vfnmadd231pd	%ymm5, %ymm13, %ymm7
	vbroadcastsd	32(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm8
	vbroadcastsd	40(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm11

	vbroadcastsd	16(%r12), %ymm13
	vmulpd			%ymm2, %ymm13, %ymm2
	vmulpd			%ymm6, %ymm13, %ymm6
	vbroadcastsd	88(%r10), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm3
	vfnmadd231pd	%ymm6, %ymm13, %ymm7
	vbroadcastsd	64(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm8
	vbroadcastsd	72(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm9
	vbroadcastsd	80(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm11

	vbroadcastsd	24(%r12), %ymm13
	vmulpd			%ymm3, %ymm13, %ymm3
	vmulpd			%ymm7, %ymm13, %ymm7
	vbroadcastsd	96(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm8
	vbroadcastsd	104(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm9
	vbroadcastsd	112(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm10
	vbroadcastsd	120(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm7, %ymm13, %ymm11
	addq	$128, %r10

	vbroadcastsd	32(%r12), %ymm13
	vmulpd			%ymm8, %ymm13, %ymm8
	cmpl			$6, %r13d
	jl				0f // ret
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm8, %ymm13, %ymm11

	vbroadcastsd	40(%r12), %ymm13
	vmulpd			%ymm9, %ymm13, %ymm9
	cmpl			$7, %r13d
	jl				0f // ret
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm9, %ymm13, %ymm11

	vbroadcastsd	48(%r12), %ymm13
	vmulpd			%ymm10, %ymm13, %ymm10
	cmpl			$8, %r13d
	jl				0f // ret
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm10, %ymm13, %ymm11

	vbroadcastsd	56(%r12), %ymm13
	vmulpd			%ymm11, %ymm13, %ymm11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
#endif
#endif





// common inner routine with file scope
//
// triangular substitution for cholesky factorization
//
// input arguments:
// r10 <- E
// r11 <- sde
// r12 <- inv_diag_E
// r13 <- D
// r14 <- sdd
// r15d <- kn
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
// ymm4 <- [d40 d51 d62 d73]
// ymm5 <- [d41 d50 d63 d72]
// ymm6 <- [d43 d52 d61 d70]
// ymm7 <- [d42 d53 d60 d71]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- sde
// r12 <- inv_diag_E
// r13 <- D
// r14 <- sdd
// r15d <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d51 d62 d73]
// ymm5 <- [d41 d50 d63 d72]
// ymm6 <- [d43 d52 d61 d70]
// ymm7 <- [d42 d53 d60 d71]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, @function
inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
#endif
#endif

	vbroadcastsd	0(%r12), %ymm13
	vmulpd			%ymm0, %ymm13, %ymm0
	vbroadcastsd	8(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm1
	vbroadcastsd	16(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm2
	vbroadcastsd	24(%r10), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm3

	vmovapd			0(%r13, %r14, 1), %ymm12
	vbroadcastsd	0(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm0, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	8(%r12), %ymm13
	vmulpd			%ymm1, %ymm13, %ymm1
	vbroadcastsd	48(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm2
	vbroadcastsd	56(%r10), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm3

	vmovapd			32(%r13, %r14, 1), %ymm12
	vbroadcastsd	32(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	40(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm1, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	16(%r12), %ymm13
	vmulpd			%ymm2, %ymm13, %ymm2
	vbroadcastsd	88(%r10), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm3

	vmovapd			64(%r13, %r14, 1), %ymm12
	vbroadcastsd	64(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	72(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	80(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm2, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11


	vbroadcastsd	24(%r12), %ymm13
	vmulpd			%ymm3, %ymm13, %ymm3

	vmovapd			96(%r13, %r14, 1), %ymm12
	vbroadcastsd	96(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm4
	vfnmadd231pd	%ymm12, %ymm13, %ymm8
	vbroadcastsd	104(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm5
	vfnmadd231pd	%ymm12, %ymm13, %ymm9
	vbroadcastsd	112(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm6
	vfnmadd231pd	%ymm12, %ymm13, %ymm10
	vbroadcastsd	120(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm3, %ymm13, %ymm7
	vfnmadd231pd	%ymm12, %ymm13, %ymm11

	addq	$128, %r10

	vbroadcastsd	32(%r12), %ymm13
	vmulpd			%ymm4, %ymm13, %ymm4
	vmulpd			%ymm8, %ymm13, %ymm8
	cmpl			$6, %r15d
	jl				0f // ret
	vbroadcastsd	8(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm5
	vfnmadd231pd	%ymm8, %ymm13, %ymm9
	vbroadcastsd	16(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm6
	vfnmadd231pd	%ymm8, %ymm13, %ymm10
	vbroadcastsd	24(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm4, %ymm13, %ymm7
	vfnmadd231pd	%ymm8, %ymm13, %ymm11

	vbroadcastsd	40(%r12), %ymm13
	vmulpd			%ymm5, %ymm13, %ymm5
	vmulpd			%ymm9, %ymm13, %ymm9
	cmpl			$7, %r15d
	jl				0f // ret
	vbroadcastsd	48(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm6
	vfnmadd231pd	%ymm9, %ymm13, %ymm10
	vbroadcastsd	56(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm5, %ymm13, %ymm7
	vfnmadd231pd	%ymm9, %ymm13, %ymm11

	vbroadcastsd	48(%r12), %ymm13
	vmulpd			%ymm6, %ymm13, %ymm6
	vmulpd			%ymm10, %ymm13, %ymm10
	cmpl			$8, %r15d
	jl				0f // ret
	vbroadcastsd	88(%r10, %r11, 1), %ymm13
	vfnmadd231pd	%ymm6, %ymm13, %ymm7
	vfnmadd231pd	%ymm10, %ymm13, %ymm11

	vbroadcastsd	56(%r12), %ymm13
	vmulpd			%ymm7, %ymm13, %ymm7
	vmulpd			%ymm11, %ymm13, %ymm11



//	subq	$128, %r10
//	vmovapd	0(%r10, %r11, 1), %ymm4
//	vmovapd	32(%r10, %r11, 1), %ymm5
//	vmovapd	64(%r10, %r11, 1), %ymm6
//	vmovapd	96(%r10, %r11, 1), %ymm7



0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
#endif
#endif





// common inner routine with file scope
//
// store n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
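//
// added note, a hedged sketch of the lib4 panel-major layout assumed by
// the 8x8 stores (indexing illustrative): element (i,j) of a panel-major
// matrix with panel stride sdd lives at d[(i/4)*4*sdd + 4*j + i%4], so
// rows 0-3 of the block start at D and rows 4-7 at D + 4*sdd doubles,
// which is the 4*sdd*sizeof(double) byte offset held in r11 below.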

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X8L_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x8l_lib4, @function
inner_store_8x8l_lib4:
#elif defined(OS_MAC)
_inner_store_8x8l_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8x8l_lib4; .scl 2; .type 32; .endef
inner_store_8x8l_lib4:
#endif
#endif

	vmovapd	%ymm0, 0(%r10)
	vmovapd	%ymm1, 32(%r10)
	vmovapd	%ymm2, 64(%r10)
	vmovapd	%ymm3, 96(%r10)

	vmovapd	%ymm4, 0(%r10, %r11, 1)
	vmovapd	%ymm5, 32(%r10, %r11, 1)
	vmovapd	%ymm6, 64(%r10, %r11, 1)
	vmovapd	%ymm7, 96(%r10, %r11, 1)

	vmovapd	%ymm8, 128(%r10, %r11, 1)
	vmovapd	%ymm9, 160(%r10, %r11, 1)
	vmovapd	%ymm10, 192(%r10, %r11, 1)
	vmovapd	%ymm11, 224(%r10, %r11, 1)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8x8l_lib4, .-inner_store_8x8l_lib4
#endif
#endif





// common inner routine with file scope
//
// store n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X8U_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x8u_lib4, @function
inner_store_8x8u_lib4:
#elif defined(OS_MAC)
_inner_store_8x8u_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8x8u_lib4; .scl 2; .type 32; .endef
inner_store_8x8u_lib4:
#endif
#endif

	vmovapd	%ymm0, 0(%r10)
	vmovapd	%ymm1, 32(%r10)
	vmovapd	%ymm2, 64(%r10)
	vmovapd	%ymm3, 96(%r10)

	vmovapd	%ymm4, 128(%r10)
	vmovapd	%ymm5, 160(%r10)
	vmovapd	%ymm6, 192(%r10)
	vmovapd	%ymm7, 224(%r10)

	vmovapd	%ymm8, 128(%r10, %r11, 1)
	vmovapd	%ymm9, 160(%r10, %r11, 1)
	vmovapd	%ymm10, 192(%r10, %r11, 1)
	vmovapd	%ymm11, 224(%r10, %r11, 1)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8x8u_lib4, .-inner_store_8x8u_lib4
#endif
#endif





// common inner routine with file scope
//
// store n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X8L_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x8l_vs_lib4, @function
inner_store_8x8l_vs_lib4:
#elif defined(OS_MAC)
_inner_store_8x8l_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8x8l_vs_lib4; .scl 2; .type 32; .endef
inner_store_8x8l_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r12d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC03(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC03(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15
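	// added commentary: ymm15 = LC03 - (double)km per lane; assuming LC03
	// holds the ascending lane constants for rows 4-7, the lanes whose row
	// index falls below km come out negative, so their sign bits make the
	// vmaskmovpd stores below write those rows and skip the rest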

	vmovapd	%ymm0, 0(%r10)
	vmovapd	%ymm1, 32(%r10)
	vmovapd	%ymm2, 64(%r10)
	vmovapd	%ymm3, 96(%r10)

	vmaskmovpd	%ymm4, %ymm15, 0(%r10, %r11, 1)
	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)

	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
	cmpl		$6, %r13d
	jl			0f // end
	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
	cmpl		$7, %r13d
	jl			0f // end
	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
	je			0f // end
	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8x8l_vs_lib4, .-inner_store_8x8l_vs_lib4
#endif
#endif





// common inner routine with file scope
//
// store n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X8U_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x8u_vs_lib4, @function
inner_store_8x8u_vs_lib4:
#elif defined(OS_MAC)
_inner_store_8x8u_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8x8u_vs_lib4; .scl 2; .type 32; .endef
inner_store_8x8u_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r12d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC03(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC03(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	vmovapd	%ymm0, 0(%r10)
	vmovapd	%ymm1, 32(%r10)
	vmovapd	%ymm2, 64(%r10)
	vmovapd	%ymm3, 96(%r10)


	vmovapd		%ymm4, 128(%r10)
	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
	cmpl		$6, %r13d
	jl			0f // end
	vmovapd		%ymm5, 160(%r10)
	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
	cmpl		$7, %r13d
	jl			0f // end
	vmovapd		%ymm6, 192(%r10)
	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
	je			0f // end
	vmovapd		%ymm7, 224(%r10)
	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8x8u_vs_lib4, .-inner_store_8x8u_vs_lib4
#endif
#endif





// common inner routine with file scope
//
// store lower n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
// ymm14 <- dirty
// ymm15 <- dirty
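//
// added note: the load/vblendpd/store sequences below re-insert the first
// 1, 2 or 3 lanes read back from D (masks 0x1/0x3/0x7), so the strictly
// upper triangle already in memory is preserved and only the lower
// triangle of the computed block is written.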

#if MACRO_LEVEL>=1
	.macro INNER_STORE_L_8X8_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_l_8x8_lib4, @function
inner_store_l_8x8_lib4:
#elif defined(OS_MAC)
_inner_store_l_8x8_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_l_8x8_lib4; .scl 2; .type 32; .endef
inner_store_l_8x8_lib4:
#endif
#endif

	vmovapd		%ymm0, 0(%r10)
	vmovapd		32(%r10), %ymm14
	vblendpd	$0x1, %ymm14, %ymm1, %ymm1
	vmovapd		%ymm1, 32(%r10)
	vmovapd		64(%r10), %ymm14
	vblendpd	$0x3, %ymm14, %ymm2, %ymm2
	vmovapd		%ymm2, 64(%r10)
	vmovapd		96(%r10), %ymm14
	vblendpd	$0x7, %ymm14, %ymm3, %ymm3
	vmovapd		%ymm3, 96(%r10)

	vmovapd		%ymm4, 0(%r10, %r11, 1)
	vmovapd		%ymm5, 32(%r10, %r11, 1)
	vmovapd		%ymm6, 64(%r10, %r11, 1)
	vmovapd		%ymm7, 96(%r10, %r11, 1)

	vmovapd		%ymm8, 128(%r10, %r11, 1)
	vmovapd		160(%r10, %r11, 1), %ymm14
	vblendpd	$0x1, %ymm14, %ymm9, %ymm9
	vmovapd		%ymm9, 160(%r10, %r11, 1)
	vmovapd		192(%r10, %r11, 1), %ymm14
	vblendpd	$0x3, %ymm14, %ymm10, %ymm10
	vmovapd		%ymm10, 192(%r10, %r11, 1)
	vmovapd		224(%r10, %r11, 1), %ymm14
	vblendpd	$0x7, %ymm14, %ymm11, %ymm11
	vmovapd		%ymm11, 224(%r10, %r11, 1)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_l_8x8_lib4, .-inner_store_l_8x8_lib4
#endif
#endif





// common inner routine with file scope
//
// store lower n
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r12d <- km
// r13d <- kn
// r14 <- dirty
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_STORE_L_8X8_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_l_8x8_vs_lib4, @function
inner_store_l_8x8_vs_lib4:
#elif defined(OS_MAC)
_inner_store_l_8x8_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_l_8x8_vs_lib4; .scl 2; .type 32; .endef
inner_store_l_8x8_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r12d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC03(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC03(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	vmovapd		%ymm0, 0(%r10)
	vmovapd		32(%r10), %ymm14
	vblendpd	$0x1, %ymm14, %ymm1, %ymm1
	vmovapd		%ymm1, 32(%r10)
	vmovapd		64(%r10), %ymm14
	vblendpd	$0x3, %ymm14, %ymm2, %ymm2
	vmovapd		%ymm2, 64(%r10)
	vmovapd		96(%r10), %ymm14
	vblendpd	$0x7, %ymm14, %ymm3, %ymm3
	vmovapd		%ymm3, 96(%r10)

	vmaskmovpd	%ymm4, %ymm15, 0(%r10, %r11, 1)
	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)

	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
	cmpl		$6, %r13d
	jl			0f // end
	vmovapd		160(%r10, %r11, 1), %ymm14
	vblendpd	$0x1, %ymm14, %ymm9, %ymm9
	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
	cmpl		$7, %r13d
	jl			0f // end
	vmovapd		192(%r10, %r11, 1), %ymm14
	vblendpd	$0x3, %ymm14, %ymm10, %ymm10
	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
	je			0f // end
	vmovapd		224(%r10, %r11, 1), %ymm14
	vblendpd	$0x7, %ymm14, %ymm11, %ymm11
	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_l_8x8_vs_lib4, .-inner_store_l_8x8_vs_lib4
#endif
#endif




// common inner routine with file scope
//
// store n generalized
//
// input arguments:
// r10 <- offset
// r11 <- D
// r12 <- 4*sdd*sizeof(double)
// r13 <- m0 // row index: start from (inc)
// r14 <- m1 // row index: up to (exc)
// r15 <- n0 // col index: start from (inc)
// rax <- n1 // col index: up to (exc)
// rbx <- dirty
// rbp <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
//
// output arguments:
// r10 <- offset
// r11 <- D
// r12 <- 4*sdd*sizeof(double)
// r13 <- m0 // row index: start from (inc)
// r14 <- m1 // row index: up to (exc)
// r15 <- n1-n0
// rax <- n1-n0
// rbx <- dirty
// rbp <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
// ymm8 <- [d80 d90 da0 db0]
// ymm9 <- [d81 d91 da1 db1]
// ymm10 <- [d82 d92 da2 db2]
// ymm11 <- [d83 d93 da3 db3]
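//
// added note, a hedged sketch of the intent: the generalized store clips
// the 8x8 block to rows m0..m1-1 and columns n0..n1-1 and supports a row
// offset of 0-3 inside the destination panel; the code below first shifts
// the register columns left by n0, then builds row masks from m0/m1, and
// finally stores across up to three 4-row panels according to the offset.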

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X8_GEN_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x8_gen_lib4, @function
inner_store_8x8_gen_lib4:
#elif defined(OS_MAC)
_inner_store_8x8_gen_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_8x8_gen_lib4; .scl 2; .type 32; .endef
inner_store_8x8_gen_lib4:
#endif
#endif

	// compute mask for rows
	vcvtsi2sd	%r13d, %xmm14, %xmm14
	vcvtsi2sd	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm12
	vmovupd		.LC03(%rip), %ymm13
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm12
	vmovupd		LC03(%rip), %ymm13
#endif
	vmovddup	%xmm14, %xmm14
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm14, %ymm14, %ymm14
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm12, %ymm14, %ymm14
	vsubpd		%ymm15, %ymm13, %ymm15

	// shift D and sol for cols
	cmpl	$0, %r15d
	jle		0f

	vmovapd	%ymm1, %ymm0
	vmovapd	%ymm5, %ymm4
	vmovapd	%ymm2, %ymm1
	vmovapd	%ymm6, %ymm5
	vmovapd	%ymm3, %ymm2
	vmovapd	%ymm7, %ymm6
	vmovapd	%ymm8, %ymm7
	vmovapd	%ymm9, %ymm8
	vmovapd	%ymm10, %ymm9
	vmovapd	%ymm11, %ymm10
	addq	$32, %r11

	cmpl	$1, %r15d
	jle		0f

	vmovapd	%ymm1, %ymm0
	vmovapd	%ymm5, %ymm4
	vmovapd	%ymm2, %ymm1
	vmovapd	%ymm6, %ymm5
	vmovapd	%ymm7, %ymm6
	vmovapd	%ymm8, %ymm7
	vmovapd	%ymm9, %ymm8
	vmovapd	%ymm10, %ymm9
	addq	$32, %r11

	cmpl	$2, %r15d
	jle		0f

	vmovapd	%ymm1, %ymm0
	vmovapd	%ymm5, %ymm4
	vmovapd	%ymm6, %ymm5
	vmovapd	%ymm7, %ymm6
	vmovapd	%ymm8, %ymm7
	vmovapd	%ymm9, %ymm8
	addq	$32, %r11

0:

	// compute number of cols
	cmpl	$8, %eax
	jle		0f
	movl	$8, %eax
0:
	subl	%r15d, %eax
	movl	%eax, %r15d

	cmpl	$0, %r10d
	jg		0f

	// offset==0

	vmaskmovpd	%ymm0, %ymm14, 0(%r11)
	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
	vmaskmovpd	%ymm3, %ymm14, 96(%r11)

	vmaskmovpd	%ymm4, %ymm15, 0(%r11, %r12, 1)
	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)

	vmaskmovpd	%ymm8, %ymm15, 128(%r11, %r12, 1)
	cmpl		$6, %r15d
	jl			4f // end
	vmaskmovpd	%ymm9, %ymm15, 160(%r11, %r12, 1)
	cmpl		$7, %r15d
	jl			4f // end
	vmaskmovpd	%ymm10, %ymm15, 192(%r11, %r12, 1)
	je			4f // end
	vmaskmovpd	%ymm11, %ymm15, 224(%r11, %r12, 1)

	jmp		4f

0:

	cmpl	$1, %r10d
	jg		1f

	// offset==1

	vmovapd		%ymm0, %ymm13
	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
	vshufpd		$0x5, %ymm4, %ymm12, %ymm4

	vmovapd		%ymm1, %ymm13
	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
	vshufpd		$0x5, %ymm5, %ymm12, %ymm5

	vmovapd		%ymm2, %ymm13
	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
	vshufpd		$0x5, %ymm6, %ymm12, %ymm6

	vmovapd		%ymm3, %ymm13
	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
	vshufpd		$0x5, %ymm7, %ymm12, %ymm7

	vperm2f128	$0x01, %ymm8, %ymm8, %ymm12
	vshufpd		$0x5, %ymm8, %ymm12, %ymm8

	vperm2f128	$0x01, %ymm9, %ymm9, %ymm12
	vshufpd		$0x5, %ymm9, %ymm12, %ymm9

	vperm2f128	$0x01, %ymm10, %ymm10, %ymm12
	vshufpd		$0x5, %ymm10, %ymm12, %ymm10

	vperm2f128	$0x01, %ymm11, %ymm11, %ymm12
	vshufpd		$0x5, %ymm11, %ymm12, %ymm11

	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
	vshufpd		$0x5, %ymm14, %ymm12, %ymm14

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC08(%rip), %ymm14, %ymm12
	vandpd		.LC05(%rip), %ymm15, %ymm13
#elif defined(OS_MAC)
	vandpd		LC08(%rip), %ymm14, %ymm12
	vandpd		LC05(%rip), %ymm15, %ymm13
#endif

	vblendpd	$0x1, %ymm14, %ymm15, %ymm14

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC08(%rip), %ymm15, %ymm15
#elif defined(OS_MAC)
	vandpd		LC08(%rip), %ymm15, %ymm15
#endif

	jmp		3f

1:

	cmpl	$2, %r10d
	jg		2f

	// offset==2

	vmovapd		%ymm0, %ymm13
	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4

	vmovapd		%ymm1, %ymm13
	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5

	vmovapd		%ymm2, %ymm13
	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6

	vmovapd		%ymm3, %ymm13
	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7

	vperm2f128	$0x01, %ymm8, %ymm8, %ymm8

	vperm2f128	$0x01, %ymm9, %ymm9, %ymm9

	vperm2f128	$0x01, %ymm10, %ymm10, %ymm10

	vperm2f128	$0x01, %ymm11, %ymm11, %ymm11

	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC09(%rip), %ymm14, %ymm12
	vandpd		.LC06(%rip), %ymm15, %ymm13
#elif defined(OS_MAC)
	vandpd		LC09(%rip), %ymm14, %ymm12
	vandpd		LC06(%rip), %ymm15, %ymm13
#endif

	vblendpd	$0x3, %ymm14, %ymm15, %ymm14

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC09(%rip), %ymm15, %ymm15
#elif defined(OS_MAC)
	vandpd		LC09(%rip), %ymm15, %ymm15
#endif

	jmp		3f

2:

	// offset==3

	vmovapd		%ymm0, %ymm13
	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
	vshufpd		$0x5, %ymm12, %ymm13, %ymm4

	vmovapd		%ymm1, %ymm13
	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
	vshufpd		$0x5, %ymm12, %ymm13, %ymm5

	vmovapd		%ymm2, %ymm13
	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
	vshufpd		$0x5, %ymm12, %ymm13, %ymm6

	vmovapd		%ymm3, %ymm13
	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
	vshufpd		$0x5, %ymm12, %ymm13, %ymm7

	vperm2f128	$0x01, %ymm8, %ymm8, %ymm12
	vshufpd		$0x5, %ymm12, %ymm8, %ymm8

	vperm2f128	$0x01, %ymm9, %ymm9, %ymm12
	vshufpd		$0x5, %ymm12, %ymm9, %ymm9

	vperm2f128	$0x01, %ymm10, %ymm10, %ymm12
	vshufpd		$0x5, %ymm12, %ymm10, %ymm10

	vperm2f128	$0x01, %ymm11, %ymm11, %ymm12
	vshufpd		$0x5, %ymm12, %ymm11, %ymm11

	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
	vshufpd		$0x5, %ymm12, %ymm15, %ymm15

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC10(%rip), %ymm14, %ymm12
	vandpd		.LC07(%rip), %ymm15, %ymm13
#elif defined(OS_MAC)
	vandpd		LC10(%rip), %ymm14, %ymm12
	vandpd		LC07(%rip), %ymm15, %ymm13
#endif

	vblendpd	$0x7, %ymm14, %ymm15, %ymm14

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vandpd		.LC10(%rip), %ymm15, %ymm15
#elif defined(OS_MAC)
	vandpd		LC10(%rip), %ymm15, %ymm15
#endif

3:

	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)

	vmaskmovpd	%ymm8, %ymm15, 128(%r11, %r12, 1)
	vmaskmovpd	%ymm8, %ymm13, 128(%r11, %r12, 2)
	cmpl		$6, %r15d
	jl			4f // end
	vmaskmovpd	%ymm9, %ymm15, 160(%r11, %r12, 1)
	vmaskmovpd	%ymm9, %ymm13, 160(%r11, %r12, 2)
	cmpl		$7, %r15d
	jl			4f // end
	vmaskmovpd	%ymm10, %ymm15, 192(%r11, %r12, 1)
	vmaskmovpd	%ymm10, %ymm13, 192(%r11, %r12, 2)
	je			4f // end
	vmaskmovpd	%ymm11, %ymm15, 224(%r11, %r12, 1)
	vmaskmovpd	%ymm11, %ymm13, 224(%r11, %r12, 2)

4:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_8x8_gen_lib4, .-inner_store_8x8_gen_lib4
#endif
#endif





//                                1      2              3          4        5          6        7             8          9        10         11
// void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
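//
// added note, a hedged reference for the kernel below (names illustrative,
// lib4 panel-major layout as sketched earlier): it computes the 8x8 block
// D = alpha*A*B^T + beta*C, storing only the panel-lower part (the upper
// right 4x4 panel, rows 0-3 and cols 4-7, is left untouched); roughly:
//
// for(jj=0; jj<8; jj++)
//     for(ii = (jj<4 ? 0 : 4); ii<8; ii++)
//         {
//         c = 0.0;
//         for(kk=0; kk<k; kk++)
//             c += A[(ii/4)*4*sda + 4*kk + ii%4] * B[(jj/4)*4*sdb + 4*kk + jj%4];
//         D[(ii/4)*4*sdd + 4*jj + ii%4] = alpha*c + beta*C[(ii/4)*4*sdc + 4*jj + ii%4];
//         }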

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nt_8x8l_lib4
	.type kernel_dgemm_nt_8x8l_lib4, @function
kernel_dgemm_nt_8x8l_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nt_8x8l_lib4
_kernel_dgemm_nt_8x8l_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nt_8x8l_lib4
	.def kernel_dgemm_nt_8x8l_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nt_8x8l_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13 // B
	movq	ARG6, %r14 // sdb
	sall	$5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_8X8L_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8l_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8l_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nt_8x8l_lib4, .-kernel_dgemm_nt_8x8l_lib4
#endif





//                                1      2              3          4        5          6        7             8          9        10         11
// void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nt_8x8u_lib4
	.type kernel_dgemm_nt_8x8u_lib4, @function
kernel_dgemm_nt_8x8u_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nt_8x8u_lib4
_kernel_dgemm_nt_8x8u_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nt_8x8u_lib4
	.def kernel_dgemm_nt_8x8u_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nt_8x8u_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG5, %r11 // B
	movq	ARG6, %r12 // sdb
	sall	$5, %r12d // 4*sdb*sizeof(double)
	movq	ARG3, %r13 // A
	movq	ARG4, %r14 // sda
	sall	$5, %r14d // 4*sda*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_TRAN_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_tran_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_tran_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_8X8U_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8u_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8u_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nt_8x8u_lib4, .-kernel_dgemm_nt_8x8u_lib4
#endif





//                                   1      2              3          4        5          6        7             8          9        10         11       12      13
// void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nt_8x8l_vs_lib4
	.type kernel_dgemm_nt_8x8l_vs_lib4, @function
kernel_dgemm_nt_8x8l_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nt_8x8l_vs_lib4
_kernel_dgemm_nt_8x8l_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nt_8x8l_vs_lib4
	.def kernel_dgemm_nt_8x8l_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nt_8x8l_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13 // B
	movq	ARG6, %r14 // sdb
	sall	$5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)
	movq	ARG12, %r12 // km
	movq	ARG13, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8L_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8l_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8l_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nt_8x8l_vs_lib4, .-kernel_dgemm_nt_8x8l_vs_lib4
#endif





//                                   1      2              3          4        5          6        7             8          9        10         11       12      13
// void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nt_8x8u_vs_lib4
	.type kernel_dgemm_nt_8x8u_vs_lib4, @function
kernel_dgemm_nt_8x8u_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nt_8x8u_vs_lib4
_kernel_dgemm_nt_8x8u_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nt_8x8u_vs_lib4
	.def kernel_dgemm_nt_8x8u_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nt_8x8u_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG5, %r11 // B
	movq	ARG6, %r12 // sdb
	sall	$5, %r12d // 4*sdb*sizeof(double)
	movq	ARG3, %r13 // A
	movq	ARG4, %r14 // sda
	sall	$5, %r14d // 4*sda*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_TRAN_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_tran_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_tran_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)
	movq	ARG12, %r12 // km
	movq	ARG13, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8U_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8u_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8u_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nt_8x8u_vs_lib4, .-kernel_dgemm_nt_8x8u_vs_lib4
#endif





#if 0
//                                    1      2              3          4        5          6        7             8         9          10       11        12         13       14      15      16      17
// void kernel_dgemm_nt_8x8_gen_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nt_8x8_gen_lib4
	.type kernel_dgemm_nt_8x8_gen_lib4, @function
kernel_dgemm_nt_8x8_gen_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nt_8x8_gen_lib4
_kernel_dgemm_nt_8x8_gen_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nt_8x8_gen_lib4
	.def kernel_dgemm_nt_8x8_gen_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nt_8x8_gen_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13 // B
	movq	ARG6, %r14 // sdb
	sall	$5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nt_8x8_gen_lib4, .-kernel_dgemm_nt_8x8_gen_lib4
#endif
#endif





//                                   1      2              3          4        5          6        7             8          9        10         11
// void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_nt_l_8x8_lib4
	.type kernel_dsyrk_nt_l_8x8_lib4, @function
kernel_dsyrk_nt_l_8x8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_nt_l_8x8_lib4
_kernel_dsyrk_nt_l_8x8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_nt_l_8x8_lib4
	.def kernel_dsyrk_nt_l_8x8_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_nt_l_8x8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13 // B
	movq	ARG6, %r14 // sdb
	sall	$5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_L_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_8x8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dsyrk_nt_l_8x8_lib4, .-kernel_dsyrk_nt_l_8x8_lib4
#endif






//                                     1      2              3          4        5          6        7             8          9        10         11       12      13
// void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_nt_l_8x8_vs_lib4
	.type kernel_dsyrk_nt_l_8x8_vs_lib4, @function
kernel_dsyrk_nt_l_8x8_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_nt_l_8x8_vs_lib4
_kernel_dsyrk_nt_l_8x8_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_nt_l_8x8_vs_lib4
	.def kernel_dsyrk_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_nt_l_8x8_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7
	vmovapd	%ymm0, %ymm8
	vmovapd	%ymm0, %ymm9
	vmovapd	%ymm0, %ymm10
	vmovapd	%ymm0, %ymm11


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13 // B
	movq	ARG6, %r14 // sdb
	sall	$5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner blend scale

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C
	movq	ARG9, %r13 // sdc
	sall	$5, %r13d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_8x8_lib4
#endif
#endif


	// store n

	movq	ARG10, %r10 // D
	movq	ARG11, %r11 // sdd
	sall	$5, %r11d // 4*sdd*sizeof(double)
	movq	ARG12, %r12 // km
	movq	ARG13, %r13 // kn
4116
4117#if MACRO_LEVEL>=1
4118 INNER_STORE_L_8X8_VS_LIB4
4119#else
4120#if defined(OS_LINUX) | defined(OS_WINDOWS)
4121 call inner_store_l_8x8_vs_lib4
4122#elif defined(OS_MAC)
4123 callq _inner_store_l_8x8_vs_lib4
4124#endif
4125#endif
4126
4127
4128 EPILOGUE
4129
4130 ret
4131
4132#if defined(OS_LINUX)
4133 .size kernel_dsyrk_nt_l_8x8_vs_lib4, .-kernel_dsyrk_nt_l_8x8_vs_lib4
4134#endif
4135
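// Editor's note (sketch, not from the original source): the _vs ("variable
// size") kernels take km/kn so the store routine can mask the tail when fewer
// than 8 rows/columns remain. A hypothetical caller tiling the diagonal of an
// n x n dsyrk would clip like this:
//
//	for (int i = 0; i < n; i += 8)
//		{
//		int km = n - i < 8 ? n - i : 8; // active rows in this block row
//		int kn = km;                    // diagonal block: columns match rows
//		// kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, ..., km, kn);
//		}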




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dpotrf_nt_l_8x8_lib4
	.type kernel_dpotrf_nt_l_8x8_lib4, @function
kernel_dpotrf_nt_l_8x8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dpotrf_nt_l_8x8_lib4
_kernel_dpotrf_nt_l_8x8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dpotrf_nt_l_8x8_lib4
	.def kernel_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
kernel_dpotrf_nt_l_8x8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
	movq ARG4, %r13 // B
	movq ARG5, %r14 // sdb
	sall $5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// factorization

	movq ARG10, %r10 // inv_diag_D
	movl $8, %r11d // kn = 8 (full block)

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_8x8_vs_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_L_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_8x8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dpotrf_nt_l_8x8_lib4, .-kernel_dpotrf_nt_l_8x8_lib4
#endif

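// Editor's note (reference sketch, not the vectorized code in this file): per
// the prototype above, this kernel forms C - A*B^T in registers and then
// factorizes the 8x8 block as D = chol_lower(...), storing the reciprocals of
// the diagonal in inv_diag_D so downstream kernels can multiply instead of
// divide. Scalar semantics of the factorization step, in dense column-major
// for clarity (the kernel itself works on the lib4 panel layout):
//
//	for (int j = 0; j < 8; j++)
//		{
//		double piv = sqrt(c[j + 8 * j]);
//		inv_diag_d[j] = 1.0 / piv;
//		for (int i = j; i < 8; i++)           // scale column j
//			c[i + 8 * j] *= inv_diag_d[j];
//		for (int jj = j + 1; jj < 8; jj++)    // trailing update
//			for (int i = jj; i < 8; i++)
//				c[i + 8 * jj] -= c[i + 8 * j] * c[jj + 8 * j];
//		}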



// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dpotrf_nt_l_8x8_vs_lib4
	.type kernel_dpotrf_nt_l_8x8_vs_lib4, @function
kernel_dpotrf_nt_l_8x8_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dpotrf_nt_l_8x8_vs_lib4
_kernel_dpotrf_nt_l_8x8_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dpotrf_nt_l_8x8_vs_lib4
	.def kernel_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
kernel_dpotrf_nt_l_8x8_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
	movq ARG4, %r13 // B
	movq ARG5, %r14 // sdb
	sall $5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// factorization

	movq ARG10, %r10 // inv_diag_D
	movq ARG12, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_8x8_vs_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG11, %r12 // km
	movq ARG12, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_L_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_8x8_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dpotrf_nt_l_8x8_vs_lib4
#endif

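// Editor's note (sketch, not from the original source): the _vs variant above
// differs from the fixed-size kernel in two places only: kn (ARG12) replaces
// the hard-coded `movl $8, %r11d`, so the edge routine factorizes just kn
// columns, and km/kn are forwarded to the masked store. A call for a trailing
// m1 x m1 diagonal block (m1 <= 8) would plausibly read:
//
//	kernel_dpotrf_nt_l_8x8_vs_lib4(k, A, sda, B, sdb, C, sdc, D, sdd,
//		inv_diag_D, m1 /* km */, m1 /* kn */);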



// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
	.type kernel_dsyrk_dpotrf_nt_l_8x8_lib4, @function
kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_dpotrf_nt_l_8x8_lib4
_kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
	.def kernel_dsyrk_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // sdap
	sall $5, %r12d // 4*sdap*sizeof(double)
	movq ARG4, %r13 // Bp
	movq ARG5, %r14 // sdbp
	sall $5, %r14d // 4*sdbp*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG6, %r10 // km
	movq ARG7, %r11 // Am
	movq ARG8, %r12 // sdam
	sall $5, %r12d // 4*sdam*sizeof(double)
	movq ARG9, %r13 // Bm
	movq ARG10, %r14 // sdbm
	sall $5, %r14d // 4*sdbm*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG11, %r10 // C
	movq ARG12, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// factorization

	movq ARG15, %r10 // inv_diag_D
	movl $8, %r11d // kn = 8 (full block)

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_8x8_vs_lib4
#endif
#endif


	// store n

	movq ARG13, %r10 // store address D
	movq ARG14, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_L_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_8x8_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dsyrk_dpotrf_nt_l_8x8_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_lib4
#endif

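// Editor's note (not from the original source): the fused kernel above keeps
// the 8x8 accumulator in ymm0-ymm11 across both products and the
// factorization, computing
//
//	D = chol_lower( C + Ap*Bp^T - Am*Bm^T )
//
// in a single pass instead of a dsyrk store followed by a dpotrf reload of
// the same block.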



// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
// void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
	.type kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, @function
kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
_kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
	.def kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // sdap
	sall $5, %r12d // 4*sdap*sizeof(double)
	movq ARG4, %r13 // Bp
	movq ARG5, %r14 // sdbp
	sall $5, %r14d // 4*sdbp*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif

	// call inner dgemm kernel nt sub

	movq ARG6, %r10 // km
	movq ARG7, %r11 // Am
	movq ARG8, %r12 // sdam
	sall $5, %r12d // 4*sdam*sizeof(double)
	movq ARG9, %r13 // Bm
	movq ARG10, %r14 // sdbm
	sall $5, %r14d // 4*sdbm*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG11, %r10 // C
	movq ARG12, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// factorization

	movq ARG15, %r10 // inv_diag_D
	movq ARG17, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_8x8_vs_lib4
#endif
#endif


	// store n

	movq ARG13, %r10 // store address D
	movq ARG14, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG16, %r12 // km
	movq ARG17, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_L_8X8_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_8x8_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_8x8_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
#endif




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
	.type kernel_dtrsm_nt_rl_inv_8x8l_lib4, @function
kernel_dtrsm_nt_rl_inv_8x8l_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_8x8l_lib4
_kernel_dtrsm_nt_rl_inv_8x8l_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
	.def kernel_dtrsm_nt_rl_inv_8x8l_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_8x8l_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
	movq ARG4, %r13 // B
	movq ARG5, %r14 // sdb
	sall $5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG10, %r10 // E
	movq ARG11, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG12, %r12 // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8l_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8l_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_8X8L_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8l_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8l_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_8x8l_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_lib4
#endif

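// Editor's note (reference sketch, not the vectorized code in this file): per
// the prototype above, this kernel solves D * E^T = C - A*B^T for D, with E
// lower triangular and inv_diag_E holding precomputed 1/E[j][j]. Scalar
// semantics, in dense column-major for clarity:
//
//	for (int j = 0; j < 8; j++)
//		for (int i = 0; i < 8; i++)
//			{
//			double t = c[i + 8 * j];
//			for (int jj = 0; jj < j; jj++)
//				t -= d[i + 8 * jj] * e[j + 8 * jj]; // E^T entry (jj, j)
//			d[i + 8 * j] = t * inv_diag_e[j];       // multiply, never divide
//			}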



// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
	.type kernel_dtrsm_nt_rl_inv_8x8u_lib4, @function
kernel_dtrsm_nt_rl_inv_8x8u_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_8x8u_lib4
_kernel_dtrsm_nt_rl_inv_8x8u_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
	.def kernel_dtrsm_nt_rl_inv_8x8u_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_8x8u_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG4, %r11 // B
	movq ARG5, %r12 // sdb
	sall $5, %r12d // 4*sdb*sizeof(double)
	movq ARG2, %r13 // A
	movq ARG3, %r14 // sda
	sall $5, %r14d // 4*sda*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner transpose & scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_TRAN_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_tran_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_tran_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG10, %r10 // E
	movq ARG11, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG12, %r12 // inv_diag_E
	movq ARG8, %r13 // D
	movq ARG9, %r14 // sdd

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8u_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8u_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_STORE_8X8U_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8u_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8u_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_8x8u_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_lib4
#endif

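// Editor's note (not from the original source): relative to the 8x8l kernel
// above, the 8x8u variant swaps the roles of A and B in the update (ARG4/ARG5
// feed the left operand) and blends the accumulator through the transposing
// scale routine, so the block reaches the solve already transposed; the edge
// routine additionally reads back D (ARG8/ARG9) while solving.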



// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
// void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
	.type kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
_kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
	.def kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG2, %r11 // A
	movq ARG3, %r12 // sda
	sall $5, %r12d // 4*sda*sizeof(double)
	movq ARG4, %r13 // B
	movq ARG5, %r14 // sdb
	sall $5, %r14d // 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG10, %r10 // E
	movq ARG11, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG12, %r12 // inv_diag_E
	movq ARG14, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG13, %r12 // km
	movq ARG14, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8L_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8l_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8l_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
#endif




// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
// void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
	.type kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
_kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
	.def kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt

	movq ARG1, %r10 // k
	movq ARG4, %r11 // B
	movq ARG5, %r12 // sdb
	sall $5, %r12d // 4*sdb*sizeof(double)
	movq ARG2, %r13 // A
	movq ARG3, %r14 // sda
	sall $5, %r14d // 4*sda*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner transpose & scale

	movq ARG6, %r10 // C
	movq ARG7, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_TRAN_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_tran_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_tran_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG10, %r10 // E
	movq ARG11, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG12, %r12 // inv_diag_E
	movq ARG8, %r13 // D
	movq ARG9, %r14 // sdd
	sall $5, %r14d // 4*sdd*sizeof(double)
	movq ARG14, %r15 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
#endif
#endif


	// store n

	movq ARG8, %r10 // store address D
	movq ARG9, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG13, %r12 // km
	movq ARG14, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8U_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8u_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8u_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
#endif




// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
	.type kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
_kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
	.def kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // sdap
	sall $5, %r12d // 4*sdap*sizeof(double)
	movq ARG4, %r13 // Bp
	movq ARG5, %r14 // sdbp
	sall $5, %r14d // 4*sdbp*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG6, %r10 // km
	movq ARG7, %r11 // Am
	movq ARG8, %r12 // sdam
	sall $5, %r12d // 4*sdam*sizeof(double)
	movq ARG9, %r13 // Bm
	movq ARG10, %r14 // sdbm
	sall $5, %r14d // 4*sdbm*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner scale

	movq ARG11, %r10 // C
	movq ARG12, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG15, %r10 // E
	movq ARG16, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG17, %r12 // inv_diag_E
	movq ARG19, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
#endif
#endif


	// store n

	movq ARG13, %r10 // store address D
	movq ARG14, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG18, %r12 // km
	movq ARG19, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8L_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8l_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8l_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
#endif

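// Editor's note (not from the original source): this fused kernel chains a
// dgemm update straight into the triangular solve,
//
//	D = ( C + Ap*Bp^T - Am*Bm^T ) * E^-T,
//
// again without spilling the 8x8 accumulator between stages. In a blocked
// factorization sweep, Ap*Bp^T would typically be the fresh panel product and
// Am*Bm^T the correction from already-factored panels (an assumption about
// the caller, not stated in this file).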



// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
	.type kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
_kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
	.def kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd %ymm0, %ymm0, %ymm0
	vmovapd %ymm0, %ymm1
	vmovapd %ymm0, %ymm2
	vmovapd %ymm0, %ymm3
	vmovapd %ymm0, %ymm4
	vmovapd %ymm0, %ymm5
	vmovapd %ymm0, %ymm6
	vmovapd %ymm0, %ymm7
	vmovapd %ymm0, %ymm8
	vmovapd %ymm0, %ymm9
	vmovapd %ymm0, %ymm10
	vmovapd %ymm0, %ymm11


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG4, %r11 // Bp
	movq ARG5, %r12 // sdbp
	sall $5, %r12d // 4*sdbp*sizeof(double)
	movq ARG2, %r13 // Ap
	movq ARG3, %r14 // sdap
	sall $5, %r14d // 4*sdap*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_8x8_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG6, %r10 // km
	movq ARG9, %r11 // Bm
	movq ARG10, %r12 // sdbm
	sall $5, %r12d // 4*sdbm*sizeof(double)
	movq ARG7, %r13 // Am
	movq ARG8, %r14 // sdam
	sall $5, %r14d // 4*sdam*sizeof(double)

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
#endif
#endif


	// call inner transpose & scale

	movq ARG11, %r10 // C
	movq ARG12, %r11 // sdc
	sall $5, %r11d // 4*sdc*sizeof(double)

#if MACRO_LEVEL>=1
	INNER_TRAN_SCALE_11_8X8_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_tran_scale_11_8x8_lib4
#elif defined(OS_MAC)
	callq _inner_tran_scale_11_8x8_lib4
#endif
#endif


	// solve

	movq ARG15, %r10 // E
	movq ARG16, %r11 // sde
	sall $5, %r11d // 4*sde*sizeof(double)
	movq ARG17, %r12 // inv_diag_E
	movq ARG13, %r13 // D
	movq ARG14, %r14 // sdd
	sall $5, %r14d // 4*sdd*sizeof(double)
	movq ARG19, %r15 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
#endif
#endif


	// store n

	movq ARG13, %r10 // store address D
	movq ARG14, %r11 // sdd
	sall $5, %r11d // 4*sdd*sizeof(double)

	movq ARG18, %r12 // km
	movq ARG19, %r13 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8U_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_8x8u_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_8x8u_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
#endif




	// read-only data
#if defined(OS_LINUX)
	.section .rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section __TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif
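
// Editor's note (assumption, not stated in this file): the tables below are
// the 32-byte constants the kernels load for sign flips and tail masking;
// the { 0.5 ... 7.5 } vectors in LC02/LC03 look like per-row indices meant to
// be compared against a broadcast row count to build a store mask, and the
// +-1.0 patterns in LC05-LC10 like lane-wise sign masks. The consuming code
// sits in the inner routines earlier in this file.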

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
	.align 5
LC00: // { -1 -1 -1 1 }
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad 1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
	.align 5
LC01: // { -1 -1 -1 -1 }
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad -1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC02: // { 3.5 2.5 1.5 0.5 }
#endif
	.long 0
	.long 1071644672
	.long 0
	.long 1073217536
	.long 0
	.long 1074003968
	.long 0
	.long 1074528256

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
	.align 5
LC03: // { 7.5 6.5 5.5 4.5 }
#endif
	.long 0
	.long 1074921472
	.long 0
	.long 1075183616
	.long 0
	.long 1075445760
	.long 0
	.long 1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC04: // { 1.0 1.0 1.0 1.0 }
#endif
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC05: // { 1.0 1.0 1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC05: // { 1.0 1.0 1.0 -1.0 }
#endif
	.long 0
	.long -1074790400
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC06: // { 1.0 1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC06: // { 1.0 1.0 -1.0 -1.0 }
#endif
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC07: // { 1.0 -1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC07: // { 1.0 -1.0 -1.0 -1.0 }
#endif
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400
	.long 0
	.long 1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC08: // { -1.0 -1.0 -1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC08: // { -1.0 -1.0 -1.0 1.0 }
#endif
	.long 0
	.long 1072693248
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC09: // { -1.0 -1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC09: // { -1.0 -1.0 1.0 1.0 }
#endif
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long -1074790400
	.long 0
	.long -1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC10: // { -1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC10: // { -1.0 1.0 1.0 1.0 }
#endif
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long -1074790400


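// Editor's note (worked example, not from the original source): each constant
// above spells out IEEE-754 doubles as pairs of 32-bit words, low word first.
// 1.0 is 0x3FF0000000000000, hence `.long 0, 1072693248` (0x3FF00000), and
// -1.0 is 0xBFF0000000000000, hence `.long 0, -1074790400` (0xBFF00000 read
// as a signed 32-bit value). A quick C check:
//
//	#include <stdio.h>
//	#include <string.h>
//	int main(void)
//		{
//		double v[2] = {1.0, -1.0};
//		unsigned long long u[2];
//		memcpy(u, v, sizeof u);
//		printf("%llx %llx\n", u[0], u[1]); // 3ff0000000000000 bff0000000000000
//		return 0;
//		}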


#if defined(OS_LINUX)
	.section .note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
