/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                    *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1 %rdi
#define ARG2 %rsi
#define ARG3 %rdx
#define ARG4 %rcx
#define ARG5 %r8
#define ARG6 %r9
#define ARG7 STACKSIZE + 8(%rsp)
#define ARG8 STACKSIZE + 16(%rsp)
#define ARG9 STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp);
#define EPILOGUE \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	addq $STACKSIZE, %rsp;
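
// Note (documentation only): on the System V AMD64 ABI used by Linux and Mac,
// the first six integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8,
// r9; arguments 7 and up are read from the caller's stack. After PROLOGUE has
// moved %rsp down by STACKSIZE, argument 7 sits at STACKSIZE+8(%rsp): the
// extra 8 bytes skip the return address pushed by the call instruction.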

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1 %rcx
#define ARG2 %rdx
#define ARG3 %r8
#define ARG4 %r9
#define ARG5 STACKSIZE + 40(%rsp)
#define ARG6 STACKSIZE + 48(%rsp)
#define ARG7 STACKSIZE + 56(%rsp)
#define ARG8 STACKSIZE + 64(%rsp)
#define ARG9 STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp); \
	movq %rdi, 48(%rsp); \
	movq %rsi, 56(%rsp); \
	vmovups %xmm6, 64(%rsp); \
	vmovups %xmm7, 80(%rsp); \
	vmovups %xmm8, 96(%rsp); \
	vmovups %xmm9, 112(%rsp); \
	vmovups %xmm10, 128(%rsp); \
	vmovups %xmm11, 144(%rsp); \
	vmovups %xmm12, 160(%rsp); \
	vmovups %xmm13, 176(%rsp); \
	vmovups %xmm14, 192(%rsp); \
	vmovups %xmm15, 208(%rsp);
#define EPILOGUE \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	movq 48(%rsp), %rdi; \
	movq 56(%rsp), %rsi; \
	vmovups 64(%rsp), %xmm6; \
	vmovups 80(%rsp), %xmm7; \
	vmovups 96(%rsp), %xmm8; \
	vmovups 112(%rsp), %xmm9; \
	vmovups 128(%rsp), %xmm10; \
	vmovups 144(%rsp), %xmm11; \
	vmovups 160(%rsp), %xmm12; \
	vmovups 176(%rsp), %xmm13; \
	vmovups 192(%rsp), %xmm14; \
	vmovups 208(%rsp), %xmm15; \
	addq $STACKSIZE, %rsp;

#else

#error wrong OS

#endif
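
// Note (documentation only): on the Windows x64 ABI the first four arguments
// arrive in rcx, rdx, r8, r9, and the caller always reserves a 32-byte shadow
// space, so argument 5 sits at return-address + shadow space = 40(%rsp) at
// entry, i.e. at STACKSIZE+40(%rsp) after PROLOGUE. Unlike on System V, rdi,
// rsi and xmm6-xmm15 are callee-saved on Windows, hence the larger STACKSIZE
// and the extra vmovups spills in the Windows PROLOGUE/EPILOGUE.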



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section __TEXT,__text,regular,pure_instructions
#endif



// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- B
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- B+4*k*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

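// For reference, a C-like sketch (not part of the original source) of the
// update performed on the 4x4 accumulator, assuming the lib4 packed format
// (A and B stored in 4-wide column-major panels):
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				D[ii+4*jj] += A[ii+4*kk] * B[jj+4*kk]; // B accessed transposed (NT)
//
// The pshufd-based lane swap below keeps eight independent multiply-add
// chains in flight; it leaves the products in the accumulators in a shuffled
// lane order, which the inner_blend_4x4_lib4 routine further down restores.
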
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
inner_kernel_dgemm_add_nt_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nt_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nt_4x4_lib4:
#endif
#endif

	cmpl $0, %r10d
	jle 2f // return

	// prefetch
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]
	movapd 0(%r12), %xmm10 // B[0]

	xorpd %xmm11, %xmm11
	movapd %xmm11, %xmm12
	movapd %xmm11, %xmm13
	movapd %xmm11, %xmm14
	movapd %xmm11, %xmm15


	cmpl $4, %r10d
	jle 0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	// unroll 0
	addpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 32(%r12), %xmm10 // B[4]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	addpd %xmm14, %xmm3
	movapd 48(%r12), %xmm14 // B[6]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 64(%r12), %xmm10 // B[8]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	addpd %xmm14, %xmm3
	movapd 80(%r12), %xmm14 // B[10]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 96(%r12), %xmm10 // B[12]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	addpd %xmm14, %xmm3
	movapd 112(%r12), %xmm14 // B[14]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addq $128, %r12 // B += 16

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	addpd %xmm10, %xmm1
	movapd 0(%r12), %xmm10 // B[0]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	cmpl $4, %r10d

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
	movapd 16(%r11), %xmm9 // A[2]


	jg 1b // main loop


0: // consider clean4-up

	cmpl $3, %r10d
	jle 4f // clean1


	// unroll 0
	addpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 32(%r12), %xmm10 // B[4]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	addpd %xmm14, %xmm3
	movapd 48(%r12), %xmm14 // B[6]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 64(%r12), %xmm10 // B[8]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	addpd %xmm14, %xmm3
	movapd 80(%r12), %xmm14 // B[10]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	addpd %xmm10, %xmm1
	movapd 96(%r12), %xmm10 // B[12]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	addpd %xmm14, %xmm3
	movapd 112(%r12), %xmm14 // B[14]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addq $128, %r12 // B += 16

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	addpd %xmm10, %xmm1
//	movapd 0(%r12), %xmm10 // B[0]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
//	cmpl $4, %r10d

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
//	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
//	movapd 16(%r11), %xmm9 // A[2]


	// clean accumulators
	addpd %xmm14, %xmm3
	addpd %xmm11, %xmm7
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6


	jmp 2f


4: // consider clean1-up loop

	cmpl $0, %r10d
	jle 2f // return

	// clean-up loop
3: // clean up loop


	// unroll 0
	addpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	addpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $1, %r10d

	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $32, %r12

	addpd %xmm10, %xmm1
	movapd 0(%r12), %xmm10 // B[0]
	addpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addq $32, %r11

	addpd %xmm15, %xmm0
	addpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
	movapd 16(%r11), %xmm9 // A[2]

	cmpl $0, %r10d

	jg 3b // clean up loop


	// clean accumulators
	addpd %xmm14, %xmm3
	addpd %xmm11, %xmm7
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
#endif
#endif





// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- B
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- B+4*k*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

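// This SUB variant mirrors the ADD routine above: the same sketch applies
// with "-=" in place of "+=" (D -= A * B^T), so every addpd into an
// accumulator becomes a subpd while the loads and address arithmetic are
// unchanged.
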
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
inner_kernel_dgemm_sub_nt_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_sub_nt_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_sub_nt_4x4_lib4:
#endif
#endif

	cmpl $0, %r10d
	jle 2f // return

	// prefetch
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]
	movapd 0(%r12), %xmm10 // B[0]

	xorpd %xmm11, %xmm11
	movapd %xmm11, %xmm12
	movapd %xmm11, %xmm13
	movapd %xmm11, %xmm14
	movapd %xmm11, %xmm15

	cmpl $4, %r10d
	jle 0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	// unroll 0
	subpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 32(%r12), %xmm10 // B[4]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	subpd %xmm14, %xmm3
	movapd 48(%r12), %xmm14 // B[6]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 64(%r12), %xmm10 // B[8]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	subpd %xmm14, %xmm3
	movapd 80(%r12), %xmm14 // B[10]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 96(%r12), %xmm10 // B[12]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	subpd %xmm14, %xmm3
	movapd 112(%r12), %xmm14 // B[14]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addq $128, %r12 // B += 16

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	subpd %xmm10, %xmm1
	movapd 0(%r12), %xmm10 // B[0]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	cmpl $4, %r10d

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
	movapd 16(%r11), %xmm9 // A[2]


	jg 1b // main loop


0: // consider clean4-up

	cmpl $3, %r10d
	jle 4f // clean1


	// unroll 0
	subpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 32(%r12), %xmm10 // B[4]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	subpd %xmm14, %xmm3
	movapd 48(%r12), %xmm14 // B[6]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 64(%r12), %xmm10 // B[8]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	subpd %xmm14, %xmm3
	movapd 80(%r12), %xmm14 // B[10]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	subpd %xmm10, %xmm1
	movapd 96(%r12), %xmm10 // B[12]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	subpd %xmm14, %xmm3
	movapd 112(%r12), %xmm14 // B[14]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addq $128, %r12 // B += 16

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	subpd %xmm10, %xmm1
//	movapd 0(%r12), %xmm10 // B[0]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
//	cmpl $4, %r10d

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
//	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
//	movapd 16(%r11), %xmm9 // A[2]


	// update accumulators
	subpd %xmm14, %xmm3
	subpd %xmm11, %xmm7
	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6


	jmp 2f


4: // consider clean1-up loop

	cmpl $0, %r10d
	jle 2f // return

	// clean-up loop
3: // clean up loop


	// unroll 0
	subpd %xmm14, %xmm3
	movapd 16(%r12), %xmm14 // B[2]
	subpd %xmm11, %xmm7
	movapd %xmm10, %xmm11
	pshufd $0x4e, %xmm10, %xmm15
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $1, %r10d

	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $32, %r12

	subpd %xmm10, %xmm1
	movapd 0(%r12), %xmm10 // B[0]
	subpd %xmm11, %xmm5
	movapd %xmm14, %xmm11
	pshufd $0x4e, %xmm14, %xmm12
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addq $32, %r11

	subpd %xmm15, %xmm0
	subpd %xmm13, %xmm4
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
	movapd 16(%r11), %xmm9 // A[2]

	cmpl $0, %r10d

	jg 3b // clean up loop


	// update accumulators
	subpd %xmm14, %xmm3
	subpd %xmm11, %xmm7
	subpd %xmm12, %xmm2
	subpd %xmm13, %xmm6


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif





// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- B
// r13 <- 4*sdb*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10d <- 0
// r11 <- A+4*k*sizeof(double)
// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13 <- 4*sdb*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

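// For reference, a C-like sketch (not part of the original source) of this
// NN variant, again assuming the lib4 packed format; B is traversed along
// its rows, one 4-row panel at a time, with sdb the panel stride:
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				D[ii+4*jj] += A[ii+4*kk] * B[(kk/4)*4*sdb + kk%4 + 4*jj];
//
// Each movddup below broadcasts one B element to both lanes, so this variant
// needs no shuffles and no final blend step.
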
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_4x4_lib4, @function
inner_kernel_dgemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_4x4_lib4:
#endif
#endif

	cmpl $0, %r10d
	jle 2f // return

	// prefetch
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	xorpd %xmm11, %xmm11
	movapd %xmm11, %xmm12
	movapd %xmm11, %xmm13
	movapd %xmm11, %xmm14
	movapd %xmm11, %xmm15


	cmpl $4, %r10d
	jle 0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0 0(%r12, %r13, 2) // software prefetch
	prefetcht0 64(%r12, %r13, 2) // software prefetch

	// unroll 0
	movddup 0(%r12), %xmm10 // B[0]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 32(%r12), %xmm15 // B[4]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 64(%r12), %xmm14 // B[8]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 96(%r12), %xmm12 // B[12]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	movddup 8(%r12), %xmm10 // B[1]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 40(%r12), %xmm15 // B[5]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 72(%r12), %xmm14 // B[9]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 104(%r12), %xmm12 // B[13]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	movddup 16(%r12), %xmm10 // B[2]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	movddup 48(%r12), %xmm15 // B[6]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 80(%r12), %xmm14 // B[10]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 112(%r12), %xmm12 // B[14]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	movddup 24(%r12), %xmm10 // B[3]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 56(%r12), %xmm15 // B[7]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	movddup 88(%r12), %xmm14 // B[11]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 120(%r12), %xmm12 // B[15]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
	movapd 16(%r11), %xmm9 // A[2]
	addq %r13, %r12 // B += ...


	cmpl $4, %r10d
	jg 1b // main loop


0: // consider clean4-up

	cmpl $3, %r10d
	jle 4f // clean1


	// unroll 0
	movddup 0(%r12), %xmm10 // B[0]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 32(%r12), %xmm15 // B[4]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 64(%r12), %xmm14 // B[8]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 96(%r12), %xmm12 // B[12]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 32(%r11), %xmm8 // A[4]
	mulpd %xmm9, %xmm13
	movapd 48(%r11), %xmm9 // A[6]


	// unroll 1
	movddup 8(%r12), %xmm10 // B[1]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 40(%r12), %xmm15 // B[5]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 72(%r12), %xmm14 // B[9]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 104(%r12), %xmm12 // B[13]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 64(%r11), %xmm8 // A[8]
	mulpd %xmm9, %xmm13
	movapd 80(%r11), %xmm9 // A[10]


	// unroll 2
	movddup 16(%r12), %xmm10 // B[2]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $4, %r10d

	movddup 48(%r12), %xmm15 // B[6]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 80(%r12), %xmm14 // B[10]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 112(%r12), %xmm12 // B[14]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	movapd 96(%r11), %xmm8 // A[12]
	mulpd %xmm9, %xmm13
	movapd 112(%r11), %xmm9 // A[14]


	// unroll 3
	movddup 24(%r12), %xmm10 // B[3]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11

	movddup 56(%r12), %xmm15 // B[7]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addq $128, %r11 // A += 16

	movddup 88(%r12), %xmm14 // B[11]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11

	movddup 120(%r12), %xmm12 // B[15]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
//	movapd 0(%r11), %xmm8 // A[0]
	mulpd %xmm9, %xmm13
//	movapd 16(%r11), %xmm9 // A[2]
	addq %r13, %r12 // B += ...


	// clean accumulators
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7


	jmp 2f


4: // consider clean1-up loop

	cmpl $0, %r10d
	jle 2f // return

	// clean-up loop
3: // clean up loop


	// unroll 0
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	subl $1, %r10d

	movddup 32(%r12), %xmm15 // B[4]
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13

	movddup 64(%r12), %xmm14 // B[8]
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addq $32, %r11

	movddup 96(%r12), %xmm12 // B[12]
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addq $8, %r12

	cmpl $0, %r10d
	jg 3b // clean up loop


	// clean accumulators
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
#endif
#endif





// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10 <- k
// r11 <- A
// r12 <- B
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm12 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10 <- k-(4-offB)
// r11 <- A+(4-offB)*bs*sizeof(double)
// r12 <- B-offB+bs*sdb*sizeof(double)
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm12 <- dirty
// xmm15 <- dirty


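// For reference (not part of the original source): when B starts at a nonzero
// offset offB inside its 4-row panel, this edge routine consumes the first
// kend = min(k, 4-offB) iterations one at a time, so that the main kernel
// afterwards sees a panel-aligned B. In C-like terms, with B indexed in
// doubles:
//
//	kend = 4-offB<k ? 4-offB : k; // the cmovgl below computes this min
//	B += offB;                    // skip into the current panel
//	// kend rank-1 updates of the 4x4 accumulator, stepping B by 1
//	B += 4*sdb - 4;               // land on the start of the next panel
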
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_4x4_lib4, @function
inner_edge_dgemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_4x4_lib4:
#endif
#endif

	cmpl $0, %r14d // offset==0
	jle 2f // end

	cmpl $0, %r10d // k==0
	jle 2f // end

	movl $4, %r15d
	subl %r14d, %r15d // 4-offsetB
	cmpl %r10d, %r15d
//	jle 0f
//	movl %r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl %r10d, %r15d // kend=min(k,4-offsetB)

	movl %r14d, %eax
	sall $3, %eax // offsetB*sizeof(double)
	addq %rax, %r12 // B+offsetB*sizeof(double)

1:
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 32(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 64(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 96(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	subl $1, %r10d // k-1
	subl $1, %r15d // kend-1
	addq $32, %r11 // A+1*bs*sizeof(double)
	addq $8, %r12 // B+1*sizeof(double)

	cmpl $0, %r15d
	jg 1b

	cmpl $0, %r10d
	jle 2f // end

	addq %r13, %r12
	subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
#endif
#endif





// common inner routine with file scope
//
// edge for B lower triangular
//
// input arguments:
// r10 <- k
// r11 <- A
// r12 <- B
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm12 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10 <- k-(4-offB)
// r11 <- A+(4-offB)*bs*sizeof(double)
// r12 <- B-offB+bs*sdb*sizeof(double)
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
// xmm0 <- [d00 d10]
// xmm1 <- [d01 d11]
// xmm2 <- [d02 d12]
// xmm3 <- [d03 d13]
// xmm4 <- [d20 d30]
// xmm5 <- [d21 d31]
// xmm6 <- [d22 d32]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm12 <- dirty
// xmm15 <- dirty


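// For reference (not part of the original source): B is lower triangular
// here, so column jj of B contributes only for row indices kk>=jj. Each offB
// case below peels the triangular head of the product: for offB==0, the first
// unrolled step touches only B[0], the second B[1] and B[5], and so on, one
// more column per step, until the full 4-column rank-1 update shape of the
// main kernel is reached.
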
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
inner_edge_dtrmm_nn_rl_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrmm_nn_rl_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
inner_edge_dtrmm_nn_rl_4x4_lib4:
#endif
#endif

	cmpl $0, %r14d
	jg 0f

	// offB==0

	// unroll 0
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	// unroll 1
	movapd 32(%r11), %xmm8 // A[0]
	movapd 48(%r11), %xmm9 // A[2]

	movddup 8(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 40(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	// unroll 2
	movapd 64(%r11), %xmm8 // A[0]
	movapd 80(%r11), %xmm9 // A[2]

	movddup 16(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 48(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 80(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	// unroll 3
	movapd 96(%r11), %xmm8 // A[0]
	movapd 112(%r11), %xmm9 // A[2]

	movddup 24(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 56(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 88(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 120(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	subl $4, %r10d // k-4
	addq $128, %r11 // A+4*bs*sizeof(double)
	addq %r13, %r12 // B+bs*sdb*sizeof(double)

	jmp 3f

0:
	cmpl $1, %r14d
	jg 1f

	// offB==1

	addq $8, %r12 // B+1*sizeof(double)

	// unroll 0
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	// unroll 1
	movapd 32(%r11), %xmm8 // A[0]
	movapd 48(%r11), %xmm9 // A[2]

	movddup 8(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 40(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	// unroll 2
	movapd 64(%r11), %xmm8 // A[0]
	movapd 80(%r11), %xmm9 // A[2]

	movddup 16(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 48(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 80(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	subl $3, %r10d // k-3
	addq $96, %r11 // A+3*bs*sizeof(double)
	addq %r13, %r12
	subq $8, %r12 // B+bs*sdb*sizeof(double)-1

	jmp 3f

1:
	cmpl $2, %r14d
	jg 2f

	// offB==2

	addq $16, %r12 // B+2*sizeof(double)

	// unroll 0
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	// unroll 1
	movapd 32(%r11), %xmm8 // A[0]
	movapd 48(%r11), %xmm9 // A[2]

	movddup 8(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 40(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	subl $2, %r10d // k-2
	addq $64, %r11 // A+2*bs*sizeof(double)
	addq %r13, %r12
	subq $16, %r12 // B+bs*sdb*sizeof(double)-2

	// unroll 2
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 32(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 64(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	// unroll 3
	movapd 32(%r11), %xmm8 // A[0]
	movapd 48(%r11), %xmm9 // A[2]

	movddup 8(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 40(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 72(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 104(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	// unroll 4
	movapd 64(%r11), %xmm8 // A[0]
	movapd 80(%r11), %xmm9 // A[2]

	movddup 16(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 48(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 80(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 112(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	// unroll 5
	movapd 96(%r11), %xmm8 // A[0]
	movapd 112(%r11), %xmm9 // A[2]

	movddup 24(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 56(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 88(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 120(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	subl $4, %r10d // k-4
	addq $128, %r11 // A+4*bs*sizeof(double)
	addq %r13, %r12 // B+bs*sdb*sizeof(double)

	jmp 3f

2:
	// offB==3

	addq $24, %r12 // B+3*sizeof(double)

	// unroll 0
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	subl $1, %r10d // k-1
	addq $32, %r11 // A+1*bs*sizeof(double)
	addq %r13, %r12
	subq $24, %r12 // B+bs*sdb*sizeof(double)-3

	// unroll 1
	movapd 0(%r11), %xmm8 // A[0]
	movapd 16(%r11), %xmm9 // A[2]

	movddup 0(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 32(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	// unroll 2
	movapd 32(%r11), %xmm8 // A[0]
	movapd 48(%r11), %xmm9 // A[2]

	movddup 8(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 40(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 72(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	// unroll 3
	movapd 64(%r11), %xmm8 // A[0]
	movapd 80(%r11), %xmm9 // A[2]

	movddup 16(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 48(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 80(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 112(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	// unroll 4
	movapd 96(%r11), %xmm8 // A[0]
	movapd 112(%r11), %xmm9 // A[2]

	movddup 24(%r12), %xmm10 // B[0]
	movapd %xmm10, %xmm11
	mulpd %xmm8, %xmm10
	mulpd %xmm9, %xmm11
	addpd %xmm10, %xmm0
	addpd %xmm11, %xmm4

	movddup 56(%r12), %xmm15 // B[4]
	movapd %xmm15, %xmm13
	mulpd %xmm8, %xmm15
	mulpd %xmm9, %xmm13
	addpd %xmm15, %xmm1
	addpd %xmm13, %xmm5

	movddup 88(%r12), %xmm14 // B[8]
	movapd %xmm14, %xmm11
	mulpd %xmm8, %xmm14
	mulpd %xmm9, %xmm11
	addpd %xmm14, %xmm2
	addpd %xmm11, %xmm6

	movddup 120(%r12), %xmm12 // B[12]
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	subl $4, %r10d // k-4
	addq $128, %r11 // A+4*bs*sizeof(double)
	addq %r13, %r12 // B+bs*sdb*sizeof(double)

3:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
#endif
#endif




// common inner routine with file scope
//
// edge for B upper triangular
//
// input arguments:
// r10 <- A
// r11 <- B
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10 <- A+4*4*sizeof(double)
// r11 <- B+4*4*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty


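// For reference (not part of the original source): B is upper triangular and
// accessed transposed here, so the kk-th of the four peeled steps multiplies
// A column kk by the kk+1 leading entries of B's column kk (offsets 0;
// 32,40; 64,72,80; 96..120 below), activating one more accumulator column
// per step.
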
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
inner_edge_dtrmm_nt_ru_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrmm_nt_ru_4x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
inner_edge_dtrmm_nt_ru_4x4_lib4:
#endif
#endif

	movapd 0(%r10), %xmm8
	movapd 16(%r10), %xmm9
	movddup 0(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4

	movapd 32(%r10), %xmm8
	movapd 48(%r10), %xmm9
	movddup 32(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	movddup 40(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5

	movapd 64(%r10), %xmm8
	movapd 80(%r10), %xmm9
	movddup 64(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	movddup 72(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5
	movddup 80(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6

	movapd 96(%r10), %xmm8
	movapd 112(%r10), %xmm9
	movddup 96(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	movddup 104(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5
	movddup 112(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	movddup 120(%r11), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7

	addq $128, %r10
	addq $128, %r11

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
#endif
#endif




// common inner routine with file scope
//
// edge for B upper triangular
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- B
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty

//
// output arguments:
// r10d <- max(k-4,0)
// r11 <- A+4*4*sizeof(double)
// r12 <- B+4*4*sizeof(double)
// xmm0 <- [d00 d10]
// xmm1 <- [d20 d30]
// xmm2 <- [d01 d11]
// xmm3 <- [d21 d31]
// xmm4 <- [d02 d12]
// xmm5 <- [d22 d32]
// xmm6 <- [d03 d13]
// xmm7 <- [d23 d33]
// xmm8 <- dirty
// xmm9 <- dirty
// xmm10 <- dirty
// xmm11 <- dirty
// xmm12 <- dirty
// xmm13 <- dirty
// xmm14 <- dirty
// xmm15 <- dirty


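// For reference (not part of the original source): this is the variable-size
// version of the edge above. It performs the same four triangular steps but
// decrements and re-checks k after each one (the subl/cmpl/jle pairs), so it
// is safe for k<4 and leaves r10d = max(k-4, 0) as documented.
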
#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
#elif defined(OS_MAC)
_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
#endif
#endif

	movapd 0(%r11), %xmm8
	movapd 16(%r11), %xmm9
	subl $1, %r10d
	movddup 0(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	addq $32, %r11
	addq $32, %r12

	cmpl $0, %r10d
	jle 0f

	movapd 0(%r11), %xmm8
	movapd 16(%r11), %xmm9
	subl $1, %r10d
	movddup 0(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	addq $32, %r11
	movddup 8(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5
	addq $32, %r12

	cmpl $0, %r10d
	jle 0f

	movapd 0(%r11), %xmm8
	movapd 16(%r11), %xmm9
	subl $1, %r10d
	movddup 0(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	movddup 8(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5
	addq $32, %r11
	movddup 16(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	addq $32, %r12

	cmpl $0, %r10d
	jle 0f

	movapd 0(%r11), %xmm8
	movapd 16(%r11), %xmm9
	subl $1, %r10d
	movddup 0(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm0
	addpd %xmm13, %xmm4
	movddup 8(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm1
	addpd %xmm13, %xmm5
	movddup 16(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm2
	addpd %xmm13, %xmm6
	addq $32, %r11
	movddup 24(%r12), %xmm12
	movapd %xmm12, %xmm13
	mulpd %xmm8, %xmm12
	mulpd %xmm9, %xmm13
	addpd %xmm12, %xmm3
	addpd %xmm13, %xmm7
	addq $32, %r12

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
#endif
#endif
2450
2451
2452
2453
2454
2455// common inner routine with file scope
2456//
2457// blend
2458//
2459// input arguments:
2460// xmm0 <- [d00 d10]
2461// xmm1 <- [d20 d30]
2462// xmm2 <- [d01 d11]
2463// xmm3 <- [d21 d31]
2464// xmm4 <- [d02 d12]
2465// xmm5 <- [d22 d32]
2466// xmm6 <- [d03 d13]
2467// xmm7 <- [d23 d33]
2468// xmm8 <- dirty
2469// xmm9 <- dirty
2470// xmm10 <- dirty
2471// xmm11 <- dirty
2472// xmm12 <- dirty
2473// xmm13 <- dirty
2474// xmm14 <- dirty
2475// xmm15 <- dirty
2476
2477// output arguments:
2478// xmm0 <- [d00 d10]
2479// xmm1 <- [d20 d30]
2480// xmm2 <- [d01 d11]
2481// xmm3 <- [d21 d31]
2482// xmm4 <- [d02 d12]
2483// xmm5 <- [d22 d32]
2484// xmm6 <- [d03 d13]
2485// xmm7 <- [d23 d33]
2486// xmm8 <- dirty
2487// xmm9 <- dirty
2488// xmm10 <- dirty
2489// xmm11 <- dirty
2490// xmm12 <- dirty
2491// xmm13 <- dirty
2492// xmm14 <- dirty
2493// xmm15 <- dirty
2494
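// The nt inner kernel leaves even and odd lanes interleaved across the
// register pairs; swapping the low halves of (xmm0,xmm1), (xmm2,xmm3),
// (xmm4,xmm5) and (xmm6,xmm7) restores plain column order. Each swap is
// the classic three-instruction exchange (a sketch in C terms):
//
//   tmp = x0;  x0.lo = x1.lo;  x1.lo = tmp.lo;  // movapd + 2x movsd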
2495#if MACRO_LEVEL>=1
2496 .macro INNER_BLEND_4X4_LIB4
2497#else
2498 .p2align 4,,15
2499#if defined(OS_LINUX)
2500 .type inner_blend_4x4_lib4, @function
2501inner_blend_4x4_lib4:
2502#elif defined(OS_MAC)
2503_inner_blend_4x4_lib4:
2504#elif defined(OS_WINDOWS)
2505 .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
2506inner_blend_4x4_lib4:
2507#endif
2508#endif
2509
2510 movapd %xmm0, %xmm8
2511 movsd %xmm1, %xmm0
2512 movsd %xmm8, %xmm1
2513
2514 movapd %xmm2, %xmm8
2515 movsd %xmm3, %xmm2
2516 movsd %xmm8, %xmm3
2517
2518 movapd %xmm4, %xmm8
2519 movsd %xmm5, %xmm4
2520 movsd %xmm8, %xmm5
2521
2522 movapd %xmm6, %xmm8
2523 movsd %xmm7, %xmm6
2524 movsd %xmm8, %xmm7
2525
2526#if MACRO_LEVEL>=1
2527 .endm
2528#else
2529 ret
2530
2531#if defined(OS_LINUX)
2532 .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
2533#endif
2534#endif
2535
2536
2537
2538
2539
2540// common inner routine with file scope
2541//
2542// scale for generic alpha and beta
2543//
2544// input arguments:
2545// r10 <- alpha
2546// r11 <- beta
2547// r12 <- C
2548// xmm0 <- [d00 d10]
2549// xmm1 <- [d20 d30]
2550// xmm2 <- [d01 d11]
2551// xmm3 <- [d21 d31]
2552// xmm4 <- [d02 d12]
2553// xmm5 <- [d22 d32]
2554// xmm6 <- [d03 d13]
2555// xmm7 <- [d23 d33]
2556// xmm8 <- dirty
2557// xmm9 <- dirty
2558// xmm10 <- dirty
2559// xmm11 <- dirty
2560// xmm12 <- dirty
2561// xmm13 <- dirty
2562// xmm14 <- dirty
2563// xmm15 <- dirty
2564
2565// output arguments:
2566// r10 <- alpha
2567// r11 <- beta
2568// r12 <- C
2569// xmm0 <- [d00 d10]
2570// xmm1 <- [d20 d30]
2571// xmm2 <- [d01 d11]
2572// xmm3 <- [d21 d31]
2573// xmm4 <- [d02 d12]
2574// xmm5 <- [d22 d32]
2575// xmm6 <- [d03 d13]
2576// xmm7 <- [d23 d33]
2577// xmm8 <- dirty
2578// xmm9 <- dirty
2579// xmm10 <- dirty
2580// xmm11 <- dirty
2581// xmm12 <- dirty
2582// xmm13 <- dirty
2583// xmm14 <- dirty
2584// xmm15 <- dirty
2585
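// Scalar equivalent of the alpha/beta update below (a sketch; C is read
// column by column from the 4x4 lib4 panel at r12):
//
//   for(j=0; j<4; j++)
//       for(i=0; i<4; i++)
//           d[i][j] = alpha*d[i][j] + beta*C[i+4*j];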
2586#if MACRO_LEVEL>=1
2587 .macro INNER_SCALE_AB_4X4_LIB4
2588#else
2589 .p2align 4,,15
2590#if defined(OS_LINUX)
2591 .type inner_scale_ab_4x4_lib4, @function
2592inner_scale_ab_4x4_lib4:
2593#elif defined(OS_MAC)
2594_inner_scale_ab_4x4_lib4:
2595#elif defined(OS_WINDOWS)
2596 .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
2597inner_scale_ab_4x4_lib4:
2598#endif
2599#endif
2600
2601 // alpha
2602 movddup 0(%r10), %xmm15
2603
2604 mulpd %xmm15, %xmm0
2605 mulpd %xmm15, %xmm1
2606 mulpd %xmm15, %xmm2
2607 mulpd %xmm15, %xmm3
2608 mulpd %xmm15, %xmm4
2609 mulpd %xmm15, %xmm5
2610 mulpd %xmm15, %xmm6
2611 mulpd %xmm15, %xmm7
2612
2613
2614 // beta
2615 movddup 0(%r11), %xmm14
2616
2617 movapd 0(%r12), %xmm15
2618 mulpd %xmm14, %xmm15
2619 addpd %xmm15, %xmm0
2620 movapd 16(%r12), %xmm15
2621 mulpd %xmm14, %xmm15
2622 addpd %xmm15, %xmm4
2623 movapd 32(%r12), %xmm15
2624 mulpd %xmm14, %xmm15
2625 addpd %xmm15, %xmm1
2626 movapd 48(%r12), %xmm15
2627 mulpd %xmm14, %xmm15
2628 addpd %xmm15, %xmm5
2629 movapd 64(%r12), %xmm15
2630 mulpd %xmm14, %xmm15
2631 addpd %xmm15, %xmm2
2632 movapd 80(%r12), %xmm15
2633 mulpd %xmm14, %xmm15
2634 addpd %xmm15, %xmm6
2635 movapd 96(%r12), %xmm15
2636 mulpd %xmm14, %xmm15
2637 addpd %xmm15, %xmm3
2638 movapd 112(%r12), %xmm15
2639 mulpd %xmm14, %xmm15
2640 addpd %xmm15, %xmm7
2641
2642#if MACRO_LEVEL>=1
2643 .endm
2644#else
2645 ret
2646
2647#if defined(OS_LINUX)
2648 .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
2649#endif
2650#endif
2651
2652
2653
2654
2655
2656// common inner routine with file scope
2657//
2658// scale for generic alpha and beta=0.0
2659//
2660// input arguments:
2661// r10 <- alpha
2662// xmm0 <- [d00 d10]
2663// xmm1 <- [d20 d30]
2664// xmm2 <- [d01 d11]
2665// xmm3 <- [d21 d31]
2666// xmm4 <- [d02 d12]
2667// xmm5 <- [d22 d32]
2668// xmm6 <- [d03 d13]
2669// xmm7 <- [d23 d33]
2670// xmm8 <- dirty
2671// xmm9 <- dirty
2672// xmm10 <- dirty
2673// xmm11 <- dirty
2674// xmm12 <- dirty
2675// xmm13 <- dirty
2676// xmm14 <- dirty
2677// xmm15 <- dirty
2678
2679// output arguments:
2680// r10 <- alpha
2681// xmm0 <- [d00 d10]
2682// xmm1 <- [d20 d30]
2683// xmm2 <- [d01 d11]
2684// xmm3 <- [d21 d31]
2685// xmm4 <- [d02 d12]
2686// xmm5 <- [d22 d32]
2687// xmm6 <- [d03 d13]
2688// xmm7 <- [d23 d33]
2689// xmm8 <- dirty
2690// xmm9 <- dirty
2691// xmm10 <- dirty
2692// xmm11 <- dirty
2693// xmm12 <- dirty
2694// xmm13 <- dirty
2695// xmm14 <- dirty
2696// xmm15 <- dirty
2697
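// With beta==0.0 the C panel is never touched, so the update reduces to
// a plain scaling (a sketch):  d[i][j] *= alpha;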
2698#if MACRO_LEVEL>=1
2699 .macro INNER_SCALE_A0_4X4_LIB4
2700#else
2701 .p2align 4,,15
2702#if defined(OS_LINUX)
2703 .type inner_scale_a0_4x4_lib4, @function
2704inner_scale_a0_4x4_lib4:
2705#elif defined(OS_MAC)
2706_inner_scale_a0_4x4_lib4:
2707#elif defined(OS_WINDOWS)
2708 .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
2709inner_scale_a0_4x4_lib4:
2710#endif
2711#endif
2712
2713 // alpha
2714 movddup 0(%r10), %xmm15
2715
2716 mulpd %xmm15, %xmm0
2717 mulpd %xmm15, %xmm1
2718 mulpd %xmm15, %xmm2
2719 mulpd %xmm15, %xmm3
2720 mulpd %xmm15, %xmm4
2721 mulpd %xmm15, %xmm5
2722 mulpd %xmm15, %xmm6
2723 mulpd %xmm15, %xmm7
2724
2725#if MACRO_LEVEL>=1
2726 .endm
2727#else
2728 ret
2729
2730#if defined(OS_LINUX)
2731 .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
2732#endif
2733#endif
2734
2735
2736
2737
2738
2739// common inner routine with file scope
2740//
2741// blend and scale for generic alpha and beta
2742//
2743// input arguments:
2744// r10 <- alpha
2745// r11 <- beta
2746// r12 <- C
2747// xmm0 <- [d00 d10]
2748// xmm1 <- [d20 d30]
2749// xmm2 <- [d01 d11]
2750// xmm3 <- [d21 d31]
2751// xmm4 <- [d02 d12]
2752// xmm5 <- [d22 d32]
2753// xmm6 <- [d03 d13]
2754// xmm7 <- [d23 d33]
2755// xmm8 <- dirty
2756// xmm9 <- dirty
2757// xmm10 <- dirty
2758// xmm11 <- dirty
2759// xmm12 <- dirty
2760// xmm13 <- dirty
2761// xmm14 <- dirty
2762// xmm15 <- dirty
2763
2764// output arguments:
2765// r10 <- alpha
2766// r11 <- beta
2767// r12 <- C
2768// xmm0 <- [d00 d10]
2769// xmm1 <- [d20 d30]
2770// xmm2 <- [d01 d11]
2771// xmm3 <- [d21 d31]
2772// xmm4 <- [d02 d12]
2773// xmm5 <- [d22 d32]
2774// xmm6 <- [d03 d13]
2775// xmm7 <- [d23 d33]
2776// xmm8 <- dirty
2777// xmm9 <- dirty
2778// xmm10 <- dirty
2779// xmm11 <- dirty
2780// xmm12 <- dirty
2781// xmm13 <- dirty
2782// xmm14 <- dirty
2783// xmm15 <- dirty
2784
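// Fusion of INNER_BLEND_4X4_LIB4 and INNER_SCALE_AB_4X4_LIB4 above: the
// low-half swaps run first, then d = alpha*d + beta*C, saving one call.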
2785#if MACRO_LEVEL>=1
2786 .macro INNER_BLEND_SCALE_AB_4X4_LIB4
2787#else
2788 .p2align 4,,15
2789#if defined(OS_LINUX)
2790 .type inner_blend_scale_ab_4x4_lib4, @function
2791inner_blend_scale_ab_4x4_lib4:
2792#elif defined(OS_MAC)
2793_inner_blend_scale_ab_4x4_lib4:
2794#elif defined(OS_WINDOWS)
2795 .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
2796inner_blend_scale_ab_4x4_lib4:
2797#endif
2798#endif
2799
2800 movapd %xmm0, %xmm8
2801 movsd %xmm1, %xmm0
2802 movsd %xmm8, %xmm1
2803
2804 movapd %xmm2, %xmm8
2805 movsd %xmm3, %xmm2
2806 movsd %xmm8, %xmm3
2807
2808 movapd %xmm4, %xmm8
2809 movsd %xmm5, %xmm4
2810 movsd %xmm8, %xmm5
2811
2812 movapd %xmm6, %xmm8
2813 movsd %xmm7, %xmm6
2814 movsd %xmm8, %xmm7
2815
2816 // alpha
2817 movddup 0(%r10), %xmm15
2818
2819 mulpd %xmm15, %xmm0
2820 mulpd %xmm15, %xmm1
2821 mulpd %xmm15, %xmm2
2822 mulpd %xmm15, %xmm3
2823 mulpd %xmm15, %xmm4
2824 mulpd %xmm15, %xmm5
2825 mulpd %xmm15, %xmm6
2826 mulpd %xmm15, %xmm7
2827
2828
2829 // beta
2830 movddup 0(%r11), %xmm14
2831
2832 movapd 0(%r12), %xmm15
2833 mulpd %xmm14, %xmm15
2834 addpd %xmm15, %xmm0
2835 movapd 16(%r12), %xmm15
2836 mulpd %xmm14, %xmm15
2837 addpd %xmm15, %xmm4
2838 movapd 32(%r12), %xmm15
2839 mulpd %xmm14, %xmm15
2840 addpd %xmm15, %xmm1
2841 movapd 48(%r12), %xmm15
2842 mulpd %xmm14, %xmm15
2843 addpd %xmm15, %xmm5
2844 movapd 64(%r12), %xmm15
2845 mulpd %xmm14, %xmm15
2846 addpd %xmm15, %xmm2
2847 movapd 80(%r12), %xmm15
2848 mulpd %xmm14, %xmm15
2849 addpd %xmm15, %xmm6
2850 movapd 96(%r12), %xmm15
2851 mulpd %xmm14, %xmm15
2852 addpd %xmm15, %xmm3
2853 movapd 112(%r12), %xmm15
2854 mulpd %xmm14, %xmm15
2855 addpd %xmm15, %xmm7
2856
2857#if MACRO_LEVEL>=1
2858 .endm
2859#else
2860 ret
2861
2862#if defined(OS_LINUX)
2863 .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
2864#endif
2865#endif
2866
2867
2868
2869
2870
2871// common inner routine with file scope
2872//
2873// blender for alpha = 1.0 and beta = 1.0
2874//
2875// input arguments:
2876// r10 <- C
2877// xmm0 <- [d00 d10]
2878// xmm1 <- [d20 d30]
2879// xmm2 <- [d01 d11]
2880// xmm3 <- [d21 d31]
2881// xmm4 <- [d02 d12]
2882// xmm5 <- [d22 d32]
2883// xmm6 <- [d03 d13]
2884// xmm7 <- [d23 d33]
2885// xmm8 <- dirty
2886// xmm9 <- dirty
2887// xmm10 <- dirty
2888// xmm11 <- dirty
2889// xmm12 <- dirty
2890// xmm13 <- dirty
2891// xmm14 <- dirty
2892// xmm15 <- dirty
2893
2894// output arguments:
2895// r10 <- C
2896// xmm0 <- [d00 d10]
2897// xmm1 <- [d20 d30]
2898// xmm2 <- [d01 d11]
2899// xmm3 <- [d21 d31]
2900// xmm4 <- [d02 d12]
2901// xmm5 <- [d22 d32]
2902// xmm6 <- [d03 d13]
2903// xmm7 <- [d23 d33]
2904// xmm8 <- dirty
2905// xmm9 <- dirty
2906// xmm10 <- dirty
2907// xmm11 <- dirty
2908// xmm12 <- dirty
2909// xmm13 <- dirty
2910// xmm14 <- dirty
2911// xmm15 <- dirty
2912
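// Special case alpha==1.0, beta==1.0: after the same blend as above the
// update is a plain accumulation, d[i][j] += C[i+4*j], with no multiplies.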
2913#if MACRO_LEVEL>=1
2914 .macro INNER_BLEND_SCALE_11_4X4_LIB4
2915#else
2916 .p2align 4,,15
2917#if defined(OS_LINUX)
2918 .type inner_blend_scale_11_4x4_lib4, @function
2919inner_blend_scale_11_4x4_lib4:
2920#elif defined(OS_MAC)
2921_inner_blend_scale_11_4x4_lib4:
2922#elif defined(OS_WINDOWS)
2923 .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
2924inner_blend_scale_11_4x4_lib4:
2925#endif
2926#endif
2927
2928 movapd %xmm0, %xmm8
2929 movsd %xmm1, %xmm0
2930 movsd %xmm8, %xmm1
2931
2932 movapd %xmm2, %xmm8
2933 movsd %xmm3, %xmm2
2934 movsd %xmm8, %xmm3
2935
2936 movapd %xmm4, %xmm8
2937 movsd %xmm5, %xmm4
2938 movsd %xmm8, %xmm5
2939
2940 movapd %xmm6, %xmm8
2941 movsd %xmm7, %xmm6
2942 movsd %xmm8, %xmm7
2943
2944
2945 movapd 0(%r10), %xmm15
2946 addpd %xmm15, %xmm0
2947 movapd 16(%r10), %xmm15
2948 addpd %xmm15, %xmm4
2949 movapd 32(%r10), %xmm15
2950 addpd %xmm15, %xmm1
2951 movapd 48(%r10), %xmm15
2952 addpd %xmm15, %xmm5
2953 movapd 64(%r10), %xmm15
2954 addpd %xmm15, %xmm2
2955 movapd 80(%r10), %xmm15
2956 addpd %xmm15, %xmm6
2957 movapd 96(%r10), %xmm15
2958 addpd %xmm15, %xmm3
2959 movapd 112(%r10), %xmm15
2960 addpd %xmm15, %xmm7
2961
2962#if MACRO_LEVEL>=1
2963	.endm
2964#else
2965	ret
2966
2967#if defined(OS_LINUX)
2968 .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
2969#endif
2970#endif
2971
2972
2973
2974
2975
2976// common inner routine with file scope
2977//
2978// cholesky factorization
2979//
2980// input arguments:
2981// r10 <- inv_diag_E
2982// r11d <- kn
2983// xmm0 <- [d00 d10]
2984// xmm1 <- [d20 d30]
2985// xmm2 <- [d01 d11]
2986// xmm3 <- [d21 d31]
2987// xmm4 <- [d02 d12]
2988// xmm5 <- [d22 d32]
2989// xmm6 <- [d03 d13]
2990// xmm7 <- [d23 d33]
2991// xmm8 <- dirty
2992// xmm9 <- dirty
2993// xmm10 <- dirty
2994// xmm11 <- dirty
2995// xmm12 <- dirty
2996// xmm13 <- dirty
2997// xmm14 <- dirty
2998// xmm15 <- dirty
2999//
3000// output arguments:
3001// r10 <- inv_diag_E
3002// r11d <- kn
3003// xmm0 <- [d00 d10]
3004// xmm1 <- [d20 d30]
3005// xmm2 <- [d01 d11]
3006// xmm3 <- [d21 d31]
3007// xmm4 <- [d02 d12]
3008// xmm5 <- [d22 d32]
3009// xmm6 <- [d03 d13]
3010// xmm7 <- [d23 d33]
3011// xmm8 <- dirty
3012// xmm9 <- dirty
3013// xmm10 <- dirty
3014// xmm11 <- dirty
3015// xmm12 <- dirty
3016// xmm13 <- dirty
3017// xmm14 <- dirty
3018// xmm15 <- dirty
3019
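// One factorization step in scalar form (a sketch of the pattern that is
// unrolled four times below; the jbe branches substitute 0.0 whenever a
// pivot is not strictly positive, and kn in r11d caps the column count):
//
//   if(d[j][j] > 0.0) inv_diag_E[j] = 1.0/sqrt(d[j][j]);
//   else              inv_diag_E[j] = 0.0;
//   for(i=j; i<4; i++) d[i][j] *= inv_diag_E[j];
//   // ... then rank-1 downdate of the remaining columns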
3020#if MACRO_LEVEL>=1
3021 .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
3022#else
3023 .p2align 4,,15
3024#if defined(OS_LINUX)
3025 .type inner_edge_dpotrf_4x4_vs_lib4, @function
3026inner_edge_dpotrf_4x4_vs_lib4:
3027#elif defined(OS_MAC)
3028_inner_edge_dpotrf_4x4_vs_lib4:
3029#elif defined(OS_WINDOWS)
3030 .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
3031inner_edge_dpotrf_4x4_vs_lib4:
3032#endif
3033#endif
3034
3035 xorpd %xmm15, %xmm15 // 0.0
3036
3037 movsd %xmm0, %xmm13
3038 ucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
3039 jbe 1f
3040 sqrtsd %xmm13, %xmm13
3041#if defined(OS_LINUX) | defined(OS_WINDOWS)
3042 movsd .LC04(%rip), %xmm12 // 1.0
3043#elif defined(OS_MAC)
3044 movsd LC04(%rip), %xmm12 // 1.0
3045#endif
3046 divsd %xmm13, %xmm12
30472:
3048 cmpl $2, %r11d
3049 movsd %xmm12, 0(%r10)
3050 movddup %xmm12, %xmm12
3051 mulpd %xmm12, %xmm0
3052 mulpd %xmm12, %xmm4
3053
3054 jl 0f // ret
3055
3056 movapd %xmm0, %xmm12
3057 shufpd $0x3, %xmm12, %xmm12
3058 movapd %xmm12, %xmm13
3059 mulpd %xmm0, %xmm12
3060 mulpd %xmm4, %xmm13
3061 subpd %xmm12, %xmm1
3062 subpd %xmm13, %xmm5
3063 movapd %xmm1, %xmm13
3064	shufpd $0x3, %xmm13, %xmm13 // broadcast d_11 (only the low lane is read below, so $0x1 would work too)
3065 ucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
3066 jbe 3f
3067 sqrtsd %xmm13, %xmm13
3068#if defined(OS_LINUX) | defined(OS_WINDOWS)
3069 movsd .LC04(%rip), %xmm12 // 1.0
3070#elif defined(OS_MAC)
3071 movsd LC04(%rip), %xmm12 // 1.0
3072#endif
3073 divsd %xmm13, %xmm12
30744:
3075 cmpl $3, %r11d
3076 movsd %xmm12, 8(%r10)
3077 movddup %xmm12, %xmm12
3078 mulpd %xmm12, %xmm1
3079 mulpd %xmm12, %xmm5
3080
3081 jl 0f // ret
3082
3083 movddup %xmm4, %xmm12
3084 movddup %xmm5, %xmm13
3085 mulpd %xmm4, %xmm12
3086 mulpd %xmm5, %xmm13
3087 subpd %xmm12, %xmm6
3088 subpd %xmm13, %xmm6
3089 movsd %xmm6, %xmm13
3090 ucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
3091 jbe 5f
3092 sqrtsd %xmm13, %xmm13
3093#if defined(OS_LINUX) | defined(OS_WINDOWS)
3094 movsd .LC04(%rip), %xmm12 // 1.0
3095#elif defined(OS_MAC)
3096 movsd LC04(%rip), %xmm12 // 1.0
3097#endif
3098 divsd %xmm13, %xmm12
30996:
3100 cmpl $4, %r11d
3101 movsd %xmm12, 16(%r10)
3102 movddup %xmm12, %xmm12
3103 mulpd %xmm12, %xmm6
3104
3105 jl 0f // ret
3106
3107 movapd %xmm4, %xmm12
3108 movapd %xmm5, %xmm13
3109 movapd %xmm6, %xmm14
3110 shufpd $0x3, %xmm12, %xmm12
3111 shufpd $0x3, %xmm13, %xmm13
3112 shufpd $0x3, %xmm14, %xmm14
3113 mulpd %xmm4, %xmm12
3114 mulpd %xmm5, %xmm13
3115 mulpd %xmm6, %xmm14
3116 subpd %xmm12, %xmm7
3117 subpd %xmm13, %xmm7
3118 subpd %xmm14, %xmm7
3119 movapd %xmm7, %xmm13
3120 shufpd $0x3, %xmm13, %xmm13
3121 ucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
3122 jbe 7f
3123 sqrtsd %xmm13, %xmm13
3124#if defined(OS_LINUX) | defined(OS_WINDOWS)
3125 movsd .LC04(%rip), %xmm12 // 1.0
3126#elif defined(OS_MAC)
3127 movsd LC04(%rip), %xmm12 // 1.0
3128#endif
3129 divsd %xmm13, %xmm12
31308:
3131 movsd %xmm12, 24(%r10)
3132 movddup %xmm12, %xmm12
3133 mulpd %xmm12, %xmm7
3134
3135 jmp 0f
3136
31371:
3138 xorpd %xmm12, %xmm12
3139 jmp 2b
3140
31413:
3142 xorpd %xmm12, %xmm12
3143 jmp 4b
3144
31455:
3146 xorpd %xmm12, %xmm12
3147 jmp 6b
3148
31497:
3150 xorpd %xmm12, %xmm12
3151 jmp 8b
3152
31530:
3154
3155#if MACRO_LEVEL>=1
3156 .endm
3157#else
3158 ret
3159
3160#if defined(OS_LINUX)
3161 .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
3162#endif
3163#endif
3164
3165
3166
3167
3168
3169// common inner routine with file scope
3170//
3171// triangular substitution for cholesky factorization
3172//
3173// input arguments:
3174// r10 <- E
3175// r11 <- inv_diag_E
3176// xmm0 <- [d00 d10]
3177// xmm1 <- [d20 d30]
3178// xmm2 <- [d01 d11]
3179// xmm3 <- [d21 d31]
3180// xmm4 <- [d02 d12]
3181// xmm5 <- [d22 d32]
3182// xmm6 <- [d03 d13]
3183// xmm7 <- [d23 d33]
3184// xmm8 <- dirty
3185// xmm9 <- dirty
3186// xmm10 <- dirty
3187// xmm11 <- dirty
3188// xmm12 <- dirty
3189// xmm13 <- dirty
3190// xmm14 <- dirty
3191// xmm15 <- dirty
3192//
3193// output arguments:
3194// r10 <- E
3195// r11 <- inv_diag_E
3196// xmm0 <- [d00 d10]
3197// xmm1 <- [d20 d30]
3198// xmm2 <- [d01 d11]
3199// xmm3 <- [d21 d31]
3200// xmm4 <- [d02 d12]
3201// xmm5 <- [d22 d32]
3202// xmm6 <- [d03 d13]
3203// xmm7 <- [d23 d33]
3204// xmm8 <- dirty
3205// xmm9 <- dirty
3206// xmm10 <- dirty
3207// xmm11 <- dirty
3208// xmm12 <- dirty
3209// xmm13 <- dirty
3210// xmm14 <- dirty
3211// xmm15 <- dirty
3212
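// Scalar sketch of this right solve with the transpose of the lower
// triangular E (diagonal reciprocals precomputed in inv_diag_E):
//
//   for(j=0; j<4; j++) {
//       for(jj=0; jj<j; jj++)
//           for(i=0; i<4; i++)
//               d[i][j] -= d[i][jj] * E[j+4*jj];
//       for(i=0; i<4; i++)
//           d[i][j] *= inv_diag_E[j];
//   }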
3213#if MACRO_LEVEL>=1
3214 .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
3215#else
3216 .p2align 4,,15
3217#if defined(OS_LINUX)
3218 .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
3219inner_edge_dtrsm_rlt_inv_4x4_lib4:
3220#elif defined(OS_MAC)
3221_inner_edge_dtrsm_rlt_inv_4x4_lib4:
3222#elif defined(OS_WINDOWS)
3223 .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
3224inner_edge_dtrsm_rlt_inv_4x4_lib4:
3225#endif
3226#endif
3227
3228 movddup 0(%r11), %xmm13
3229 mulpd %xmm13, %xmm0
3230 mulpd %xmm13, %xmm4
3231
3232 movddup 8(%r10), %xmm13
3233 movapd %xmm13, %xmm12
3234 mulpd %xmm0, %xmm13
3235 mulpd %xmm4, %xmm12
3236 subpd %xmm13, %xmm1
3237 subpd %xmm12, %xmm5
3238 movddup 8(%r11), %xmm13
3239 mulpd %xmm13, %xmm1
3240 mulpd %xmm13, %xmm5
3241
3242 movddup 16(%r10), %xmm13
3243 movapd %xmm13, %xmm12
3244 mulpd %xmm0, %xmm12
3245 mulpd %xmm4, %xmm13
3246 subpd %xmm12, %xmm2
3247 subpd %xmm13, %xmm6
3248 movddup 48(%r10), %xmm13
3249 movapd %xmm13, %xmm12
3250 mulpd %xmm1, %xmm12
3251 mulpd %xmm5, %xmm13
3252 subpd %xmm12, %xmm2
3253 subpd %xmm13, %xmm6
3254 movddup 16(%r11), %xmm13
3255 mulpd %xmm13, %xmm2
3256 mulpd %xmm13, %xmm6
3257
3258 movddup 24(%r10), %xmm13
3259 movapd %xmm13, %xmm12
3260 mulpd %xmm0, %xmm12
3261 mulpd %xmm4, %xmm13
3262 subpd %xmm12, %xmm3
3263 subpd %xmm13, %xmm7
3264 movddup 56(%r10), %xmm13
3265 movapd %xmm13, %xmm12
3266 mulpd %xmm1, %xmm12
3267 mulpd %xmm5, %xmm13
3268 subpd %xmm12, %xmm3
3269 subpd %xmm13, %xmm7
3270 movddup 88(%r10), %xmm13
3271 movapd %xmm13, %xmm12
3272 mulpd %xmm2, %xmm12
3273 mulpd %xmm6, %xmm13
3274 subpd %xmm12, %xmm3
3275 subpd %xmm13, %xmm7
3276 movddup 24(%r11), %xmm13
3277 mulpd %xmm13, %xmm3
3278 mulpd %xmm13, %xmm7
3279
3280#if MACRO_LEVEL>=1
3281 .endm
3282#else
3283 ret
3284
3285#if defined(OS_LINUX)
3286 .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
3287#endif
3288#endif
3289
3290
3291
3292
3293
3294// common inner routine with file scope
3295//
3296// triangular substitution for cholesky factorization
3297//
3298// input arguments:
3299// r10 <- D
3300// r11 <- inv_diag_D
3301// r12d <- kn
3302// xmm0 <- [d00 d10]
3303// xmm1 <- [d20 d30]
3304// xmm2 <- [d01 d11]
3305// xmm3 <- [d21 d31]
3306// xmm4 <- [d02 d12]
3307// xmm5 <- [d22 d32]
3308// xmm6 <- [d03 d13]
3309// xmm7 <- [d23 d33]
3310// xmm8 <- dirty
3311// xmm9 <- dirty
3312// xmm10 <- dirty
3313// xmm11 <- dirty
3314// xmm12 <- dirty
3315// xmm13 <- dirty
3316// xmm14 <- dirty
3317// xmm15 <- dirty
3318//
3319// output arguments:
3320// r10 <- D
3321// r11 <- inv_diag_D
3322// r12d <- kn
3323// xmm0 <- [d00 d10]
3324// xmm1 <- [d20 d30]
3325// xmm2 <- [d01 d11]
3326// xmm3 <- [d21 d31]
3327// xmm4 <- [d02 d12]
3328// xmm5 <- [d22 d32]
3329// xmm6 <- [d03 d13]
3330// xmm7 <- [d23 d33]
3331// xmm8 <- dirty
3332// xmm9 <- dirty
3333// xmm10 <- dirty
3334// xmm11 <- dirty
3335// xmm12 <- dirty
3336// xmm13 <- dirty
3337// xmm14 <- dirty
3338// xmm15 <- dirty
3339
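// Same substitution as the routine above, except that kn (r12d) caps the
// number of columns solved: the jl branches return after 1, 2 or 3.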
3340#if MACRO_LEVEL>=1
3341 .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
3342#else
3343 .p2align 4,,15
3344#if defined(OS_LINUX)
3345 .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
3346inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
3347#elif defined(OS_MAC)
3348_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
3349#elif defined(OS_WINDOWS)
3350 .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
3351inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
3352#endif
3353#endif
3354
3355 movddup 0(%r11), %xmm13
3356 cmpl $2, %r12d
3357 mulpd %xmm13, %xmm0
3358 mulpd %xmm13, %xmm4
3359
3360 jl 0f // ret
3361
3362 movddup 8(%r10), %xmm13
3363 cmpl $3, %r12d
3364 movapd %xmm13, %xmm12
3365 mulpd %xmm0, %xmm13
3366 mulpd %xmm4, %xmm12
3367 subpd %xmm13, %xmm1
3368 subpd %xmm12, %xmm5
3369 movddup 8(%r11), %xmm13
3370 mulpd %xmm13, %xmm1
3371 mulpd %xmm13, %xmm5
3372
3373 jl 0f // ret
3374
3375 movddup 16(%r10), %xmm13
3376 cmpl $4, %r12d
3377 movapd %xmm13, %xmm12
3378 mulpd %xmm0, %xmm12
3379 mulpd %xmm4, %xmm13
3380 subpd %xmm12, %xmm2
3381 subpd %xmm13, %xmm6
3382 movddup 48(%r10), %xmm13
3383 movapd %xmm13, %xmm12
3384 mulpd %xmm1, %xmm12
3385 mulpd %xmm5, %xmm13
3386 subpd %xmm12, %xmm2
3387 subpd %xmm13, %xmm6
3388 movddup 16(%r11), %xmm13
3389 mulpd %xmm13, %xmm2
3390 mulpd %xmm13, %xmm6
3391
3392 jl 0f // ret
3393
3394 movddup 24(%r10), %xmm13
3395 movapd %xmm13, %xmm12
3396 mulpd %xmm0, %xmm12
3397 mulpd %xmm4, %xmm13
3398 subpd %xmm12, %xmm3
3399 subpd %xmm13, %xmm7
3400 movddup 56(%r10), %xmm13
3401 movapd %xmm13, %xmm12
3402 mulpd %xmm1, %xmm12
3403 mulpd %xmm5, %xmm13
3404 subpd %xmm12, %xmm3
3405 subpd %xmm13, %xmm7
3406 movddup 88(%r10), %xmm13
3407 movapd %xmm13, %xmm12
3408 mulpd %xmm2, %xmm12
3409 mulpd %xmm6, %xmm13
3410 subpd %xmm12, %xmm3
3411 subpd %xmm13, %xmm7
3412 movddup 24(%r11), %xmm13
3413 mulpd %xmm13, %xmm3
3414 mulpd %xmm13, %xmm7
3415
34160:
3417
3418#if MACRO_LEVEL>=1
3419 .endm
3420#else
3421 ret
3422
3423#if defined(OS_LINUX)
3424 .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
3425#endif
3426#endif
3427
3428
3429
3430
3431
3432// common inner routine with file scope
3433//
3434// store n
3435//
3436// input arguments:
3437// r10 <- D
3438// xmm0 <- [d00 d10]
3439// xmm1 <- [d20 d30]
3440// xmm2 <- [d01 d11]
3441// xmm3 <- [d21 d31]
3442// xmm4 <- [d02 d12]
3443// xmm5 <- [d22 d32]
3444// xmm6 <- [d03 d13]
3445// xmm7 <- [d23 d33]
3446// xmm8 <- dirty
3447// xmm9 <- dirty
3448// xmm10 <- dirty
3449// xmm11 <- dirty
3450// xmm12 <- dirty
3451// xmm13 <- dirty
3452// xmm14 <- dirty
3453// xmm15 <- dirty
3454//
3455// output arguments:
3456// r10 <- D
3457// xmm0 <- [d00 d10]
3458// xmm1 <- [d01 d11]
3459// xmm2 <- [d02 d12]
3460// xmm3 <- [d03 d13]
3461// xmm4 <- [d20 d30]
3462// xmm5 <- [d21 d31]
3463// xmm6 <- [d22 d32]
3464// xmm7 <- [d23 d33]
3465// xmm8 <- dirty
3466// xmm9 <- dirty
3467// xmm10 <- dirty
3468// xmm11 <- dirty
3469// xmm12 <- dirty
3470// xmm13 <- dirty
3471// xmm14 <- dirty
3472// xmm15 <- dirty
3473
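// Column j of the 4x4 block lives at byte offset 32*j in the D panel:
// rows 0-1 come from xmm0..xmm3 (offset +0), rows 2-3 from xmm4..xmm7
// (offset +16). In C terms (a sketch):
//
//   for(j=0; j<4; j++)
//       for(i=0; i<4; i++)
//           D[i+4*j] = d[i][j];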
3474#if MACRO_LEVEL>=1
3475 .macro INNER_STORE_4X4_LIB4
3476#else
3477 .p2align 4,,15
3478#if defined(OS_LINUX)
3479 .type inner_store_4x4_lib4, @function
3480inner_store_4x4_lib4:
3481#elif defined(OS_MAC)
3482_inner_store_4x4_lib4:
3483#elif defined(OS_WINDOWS)
3484 .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
3485inner_store_4x4_lib4:
3486#endif
3487#endif
3488
3489 movapd %xmm0, 0(%r10)
3490 movapd %xmm4, 16(%r10)
3491 movapd %xmm1, 32(%r10)
3492 movapd %xmm5, 48(%r10)
3493 movapd %xmm2, 64(%r10)
3494 movapd %xmm6, 80(%r10)
3495 movapd %xmm3, 96(%r10)
3496 movapd %xmm7, 112(%r10)
3497
3498#if MACRO_LEVEL>=1
3499 .endm
3500#else
3501 ret
3502
3503#if defined(OS_LINUX)
3504 .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
3505#endif
3506#endif
3507
3508
3509
3510
3511
3512// common inner routine with file scope
3513//
3514// store n vs
3515//
3516// input arguments:
3517// r10 <- D
3518// r11d <- km
3519// r12d <- kn
3520// xmm0 <- [d00 d10]
3521// xmm1 <- [d20 d30]
3522// xmm2 <- [d01 d11]
3523// xmm3 <- [d21 d31]
3524// xmm4 <- [d02 d12]
3525// xmm5 <- [d22 d32]
3526// xmm6 <- [d03 d13]
3527// xmm7 <- [d23 d33]
3528// xmm8 <- dirty
3529// xmm9 <- dirty
3530// xmm10 <- dirty
3531// xmm11 <- dirty
3532// xmm12 <- dirty
3533// xmm13 <- dirty
3534// xmm14 <- dirty
3535// xmm15 <- dirty
3536//
3537// output arguments:
3538// r10 <- D
3539// r11d <- km
3540// r12d <- kn
3541// xmm0 <- [d00 d10]
3542// xmm1 <- [d20 d30]
3543// xmm2 <- [d01 d11]
3544// xmm3 <- [d21 d31]
3545// xmm4 <- [d02 d12]
3546// xmm5 <- [d22 d32]
3547// xmm6 <- [d03 d13]
3548// xmm7 <- [d23 d33]
3549// xmm8 <- dirty
3550// xmm9 <- dirty
3551// xmm10 <- dirty
3552// xmm11 <- dirty
3553// xmm12 <- dirty
3554// xmm13 <- dirty
3555// xmm14 <- dirty
3556// xmm15 <- dirty
3557
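// "vs" (variable size) stores only the top-left km x kn corner of the
// block: a trailing odd row (km==1 or km==3) is written with movsd, and
// columns at index kn and beyond are skipped.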
3558#if MACRO_LEVEL>=1
3559 .macro INNER_STORE_4X4_VS_LIB4
3560#else
3561 .p2align 4,,15
3562#if defined(OS_LINUX)
3563 .type inner_store_4x4_vs_lib4, @function
3564inner_store_4x4_vs_lib4:
3565#elif defined(OS_MAC)
3566_inner_store_4x4_vs_lib4:
3567#elif defined(OS_WINDOWS)
3568 .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
3569inner_store_4x4_vs_lib4:
3570#endif
3571#endif
3572
3573 cmpl $2, %r11d
3574 jg 1f
3575 je 0f
3576
3577 // km==1
3578 movsd %xmm0, 0(%r10)
3579 cmpl $2, %r12d
3580 jl 4f // end
3581 movsd %xmm1, 32(%r10)
3582 cmpl $3, %r12d
3583 jl 4f // end
3584 movsd %xmm2, 64(%r10)
3585 je 4f // end
3586 movsd %xmm3, 96(%r10)
3587
3588 jmp 4f
3589
35900:
3591 // km==2
3592 movapd %xmm0, 0(%r10)
3593 cmpl $2, %r12d
3594 jl 4f // end
3595 movapd %xmm1, 32(%r10)
3596 cmpl $3, %r12d
3597 jl 4f // end
3598 movapd %xmm2, 64(%r10)
3599 je 4f // end
3600 movapd %xmm3, 96(%r10)
3601
3602 jmp 4f
3603
36041:
3605 cmpl $3, %r11d
3606 jg 2f
3607
3608 // km==3
3609 movapd %xmm0, 0(%r10)
3610 movsd %xmm4, 16(%r10)
3611 cmpl $2, %r12d
3612 jl 4f // end
3613 movapd %xmm1, 32(%r10)
3614 movsd %xmm5, 48(%r10)
3615 cmpl $3, %r12d
3616 jl 4f // end
3617 movapd %xmm2, 64(%r10)
3618 movsd %xmm6, 80(%r10)
3619 je 4f // end
3620 movapd %xmm3, 96(%r10)
3621 movsd %xmm7, 112(%r10)
3622
3623 jmp 4f
3624
36252:
3626 // km==4
3627 movapd %xmm0, 0(%r10)
3628 movapd %xmm4, 16(%r10)
3629 cmpl $2, %r12d
3630 jl 4f // end
3631 movapd %xmm1, 32(%r10)
3632 movapd %xmm5, 48(%r10)
3633 cmpl $3, %r12d
3634 jl 4f // end
3635 movapd %xmm2, 64(%r10)
3636 movapd %xmm6, 80(%r10)
3637 je 4f // end
3638 movapd %xmm3, 96(%r10)
3639 movapd %xmm7, 112(%r10)
3640
36414:
3642
3643#if MACRO_LEVEL>=1
3644 .endm
3645#else
3646 ret
3647
3648#if defined(OS_LINUX)
3649 .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
3650#endif
3651#endif
3652
3653
3654
3655
3656
3657// common inner routine with file scope
3658//
3659// store n generalized
3660//
3661// input arguments:
3662// r10 <- offset
3663// r11 <- D
3664// r12 <- 4*sdd*sizeof(double)
3665// r13 <- m0 // row index: start from (inc)
3666// r14 <- m1 // row index: up to (exc)
3667// r15 <- n0 // col index: start from (inc)
3668// rax <- n1 // col index: up to (exc)
3669// rbx <- dirty
3670// xmm0 <-
3671//
3672// output arguments:
3673// r10 <- offset
3674// r11 <- D
3675// r12 <- 4*sdd*sizeof(double)
3676// r13 <- m0 // row index: start from (inc)
3677// r14 <- m1 // row index: up to (exc)
3678// r15 <- n1-n0
3679// rax <- n1-n0
3680// rbx <- dirty
3681// xmm0 <-
3682
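// The gen store first shifts the accumulator columns left by n0 and
// clips the column count at n1, then dispatches on the row offset; only
// the offset==0 path is implemented so far (offsets 1..3 are TODO).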
3683#if MACRO_LEVEL>=1
3684 .macro INNER_STORE_4X4_GEN_LIB4
3685#else
3686 .p2align 4,,15
3687#if defined(OS_LINUX)
3688 .type inner_store_4x4_gen_lib4, @function
3689inner_store_4x4_gen_lib4:
3690#elif defined(OS_MAC)
3691_inner_store_4x4_gen_lib4:
3692#elif defined(OS_WINDOWS)
3693 .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
3694inner_store_4x4_gen_lib4:
3695#endif
3696#endif
3697
3698	// TODO: masks computation
3699
3700 // shift D and sol for cols
3701 cmpl $0, %r15d
3702 jle 0f
3703
3704	movapd %xmm1, %xmm0
3705	movapd %xmm5, %xmm4
3706	movapd %xmm2, %xmm1
3707	movapd %xmm6, %xmm5
3708	movapd %xmm3, %xmm2
3709	movapd %xmm7, %xmm6
3710 addq $32, %r11
3711
3712 cmpl $1, %r15d
3713 jle 0f
3714
3715	movapd %xmm1, %xmm0
3716	movapd %xmm5, %xmm4
3717	movapd %xmm2, %xmm1
3718	movapd %xmm6, %xmm5
3719 addq $32, %r11
3720
3721 cmpl $2, %r15d
3722 jle 0f
3723
3724	movapd %xmm1, %xmm0
3725	movapd %xmm5, %xmm4
3726 addq $32, %r11
3727
37280:
3729
3730 // compute number of cols
3731 cmpl $4, %eax
3732 jle 0f
3733 movl $4, %eax
37340:
3735 subl %r15d, %eax
3736 movl %eax, %r15d
3737
3738
3739 cmpl $0, %r10d
3740 jg 0f
3741
3742 ///////////////
3743 // offset==0 //
3744 ///////////////
3745
3746 cmpl $0, %r13d
3747 jle 4f
3748
3749 cmpl $1, %r13d
3750 jg 5f
3751
3752 movsd 0(%r11), %xmm8
3753 movsd %xmm8, %xmm0
3754 movsd 32(%r11), %xmm8
3755 movsd %xmm8, %xmm1
3756 movsd 64(%r11), %xmm8
3757 movsd %xmm8, %xmm2
3758 movsd 96(%r11), %xmm8
3759 movsd %xmm8, %xmm3
3760
3761 jmp 4f
3762
37635:
3764
3765 cmpl $2, %r13d
3766 jg 5f
3767
3768 movapd 0(%r11), %xmm0
3769 movapd 32(%r11), %xmm1
3770 movapd 64(%r11), %xmm2
3771 movapd 96(%r11), %xmm3
3772
3773 jmp 4f
3774
37755:
3776
3777 cmpl $3, %r13d
3778 jg 5f
3779
3780 movapd 0(%r11), %xmm0
3781 movsd 16(%r11), %xmm8
3782 movsd %xmm8, %xmm4
3783 movapd 32(%r11), %xmm1
3784 movsd 48(%r11), %xmm8
3785 movsd %xmm8, %xmm5
3786 movapd 64(%r11), %xmm2
3787 movsd 80(%r11), %xmm8
3788 movsd %xmm8, %xmm6
3789 movapd 96(%r11), %xmm3
3790 movsd 112(%r11), %xmm8
3791 movsd %xmm8, %xmm7
3792
3793 jmp 4f
3794
37955:
3796
3797 movapd 0(%r11), %xmm0
3798 movapd 16(%r11), %xmm4
3799 movapd 32(%r11), %xmm1
3800 movapd 48(%r11), %xmm5
3801 movapd 64(%r11), %xmm2
3802 movapd 80(%r11), %xmm6
3803 movapd 96(%r11), %xmm3
3804 movapd 112(%r11), %xmm7
3805
38064:
3807 cmpl $2, %r14d
3808 jg 5f
3809 je 4f
3810
3811 // km==1
3812 movsd %xmm0, 0(%r11)
3813 cmpl $2, %r15d
3814 jl 3f // end
3815 movsd %xmm1, 32(%r11)
3816 cmpl $3, %r15d
3817 jl 3f // end
3818 movsd %xmm2, 64(%r11)
3819 je 3f // end
3820 movsd %xmm3, 96(%r11)
3821
3822 jmp 3f
3823
38244:
3825 // km==2
3826 movapd %xmm0, 0(%r11)
3827 cmpl $2, %r15d
3828 jl 3f // end
3829 movapd %xmm1, 32(%r11)
3830 cmpl $3, %r15d
3831 jl 3f // end
3832 movapd %xmm2, 64(%r11)
3833 je 3f // end
3834 movapd %xmm3, 96(%r11)
3835
3836 jmp 3f
3837
38385:
3839 cmpl $3, %r14d
3840 jg 6f
3841
3842 // km==3
3843 movapd %xmm0, 0(%r11)
3844 movsd %xmm4, 16(%r11)
3845 cmpl $2, %r15d
3846 jl 3f // end
3847 movapd %xmm1, 32(%r11)
3848 movsd %xmm5, 48(%r11)
3849 cmpl $3, %r15d
3850 jl 3f // end
3851 movapd %xmm2, 64(%r11)
3852 movsd %xmm6, 80(%r11)
3853 je 3f // end
3854 movapd %xmm3, 96(%r11)
3855 movsd %xmm7, 112(%r11)
3856
3857 jmp 3f
3858
38596:
3860 // km==4
3861 movapd %xmm0, 0(%r11)
3862 movapd %xmm4, 16(%r11)
3863 cmpl $2, %r15d
3864 jl 3f // end
3865 movapd %xmm1, 32(%r11)
3866 movapd %xmm5, 48(%r11)
3867 cmpl $3, %r15d
3868 jl 3f // end
3869 movapd %xmm2, 64(%r11)
3870 movapd %xmm6, 80(%r11)
3871 je 3f // end
3872 movapd %xmm3, 96(%r11)
3873 movapd %xmm7, 112(%r11)
3874
3875 jmp 3f
3876
38770:
3878
3879 movq %r11, %rbx // D0
3880 addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
3881
3882 cmpl $1, %r10d
3883 jg 1f
3884
3885 ///////////////
3886 // offset==1 //
3887 ///////////////
3888
3889 // TODO
3890
3891 jmp 3f
3892
38931:
3894
3895 cmpl $2, %r10d
3896 jg 2f
3897
3898 ///////////////
3899 // offset==2 //
3900 ///////////////
3901
3902 // TODO
3903
3904 jmp 3f
3905
39062:
3907
3908 ///////////////
3909 // offset==3 //
3910 ///////////////
3911
3912 // TODO
3913
39143:
3915
3916#if MACRO_LEVEL>=1
3917 .endm
3918#else
3919 ret
3920
3921#if defined(OS_LINUX)
3922 .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
3923#endif
3924#endif
3925
3926
3927
3928
3929
3930// common inner routine with file scope
3931//
3932// store n lower triangular
3933//
3934// input arguments:
3935// r10 <- D
3936// xmm0 <- [d00 d10]
3937// xmm1 <- [d20 d30]
3938// xmm2 <- [d01 d11]
3939// xmm3 <- [d21 d31]
3940// xmm4 <- [d02 d12]
3941// xmm5 <- [d22 d32]
3942// xmm6 <- [d03 d13]
3943// xmm7 <- [d23 d33]
3944// xmm8 <- dirty
3945// xmm9 <- dirty
3946// xmm10 <- dirty
3947// xmm11 <- dirty
3948// xmm12 <- dirty
3949// xmm13 <- dirty
3950// xmm14 <- dirty
3951// xmm15 <- dirty
3952//
3953// output arguments:
3954// r10 <- D
3955// xmm0 <- [d00 d10]
3956// xmm1 <- [d20 d30]
3957// xmm2 <- [d01 d11]
3958// xmm3 <- [d21 d31]
3959// xmm4 <- [d02 d12]
3960// xmm5 <- [d22 d32]
3961// xmm6 <- [d03 d13]
3962// xmm7 <- [d23 d33]
3963// xmm8 <- dirty
3964// xmm9 <- dirty
3965// xmm10 <- dirty
3966// xmm11 <- dirty
3967// xmm12 <- dirty
3968// xmm13 <- dirty
3969// xmm14 <- dirty
3970// xmm15 <- dirty
3971
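// Lower triangular store: elements strictly above the diagonal must keep
// their old values in D, so d01 (column 1) and d23 (column 3) are first
// reloaded with movsd and merged back before their columns are written;
// rows 0-1 of columns 2 and 3 are not written at all.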
3972#if MACRO_LEVEL>=1
3973 .macro INNER_STORE_L_4X4_LIB4
3974#else
3975 .p2align 4,,15
3976#if defined(OS_LINUX)
3977 .type inner_store_l_4x4_lib4, @function
3978inner_store_l_4x4_lib4:
3979#elif defined(OS_MAC)
3980_inner_store_l_4x4_lib4:
3981#elif defined(OS_WINDOWS)
3982 .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
3983inner_store_l_4x4_lib4:
3984#endif
3985#endif
3986
3987 movapd %xmm0, 0(%r10)
3988 movapd %xmm4, 16(%r10)
3989 movsd 32(%r10), %xmm15
3990 movsd %xmm15, %xmm1
3991 movapd %xmm1, 32(%r10)
3992 movapd %xmm5, 48(%r10)
3993// movapd %xmm2, 64(%r10)
3994 movapd %xmm6, 80(%r10)
3995// movapd %xmm3, 96(%r10)
3996 movsd 112(%r10), %xmm15
3997 movsd %xmm15, %xmm7
3998 movapd %xmm7, 112(%r10)
3999
4000#if MACRO_LEVEL>=1
4001 .endm
4002#else
4003 ret
4004
4005#if defined(OS_LINUX)
4006 .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
4007#endif
4008#endif
4009
4010
4011
4012
4013
4014// common inner routine with file scope
4015//
4016// store n vs lower triangular
4017//
4018// input arguments:
4019// r10 <- D
4020// r11d <- km
4021// r12d <- kn
4022// xmm0 <- [d00 d10]
4023// xmm1 <- [d20 d30]
4024// xmm2 <- [d01 d11]
4025// xmm3 <- [d21 d31]
4026// xmm4 <- [d02 d12]
4027// xmm5 <- [d22 d32]
4028// xmm6 <- [d03 d13]
4029// xmm7 <- [d23 d33]
4030// xmm8 <- dirty
4031// xmm9 <- dirty
4032// xmm10 <- dirty
4033// xmm11 <- dirty
4034// xmm12 <- dirty
4035// xmm13 <- dirty
4036// xmm14 <- dirty
4037// xmm15 <- dirty
4038//
4039// output arguments:
4040// r10 <- D
4041// r11d <- km
4042// r12d <- kn
4043// xmm0 <- [d00 d10]
4044// xmm1 <- [d20 d30]
4045// xmm2 <- [d01 d11]
4046// xmm3 <- [d21 d31]
4047// xmm4 <- [d02 d12]
4048// xmm5 <- [d22 d32]
4049// xmm6 <- [d03 d13]
4050// xmm7 <- [d23 d33]
4051// xmm8 <- dirty
4052// xmm9 <- dirty
4053// xmm10 <- dirty
4054// xmm11 <- dirty
4055// xmm12 <- dirty
4056// xmm13 <- dirty
4057// xmm14 <- dirty
4058// xmm15 <- dirty
4059
4060#if MACRO_LEVEL>=1
4061 .macro INNER_STORE_L_4X4_VS_LIB4
4062#else
4063 .p2align 4,,15
4064#if defined(OS_LINUX)
4065 .type inner_store_l_4x4_vs_lib4, @function
4066inner_store_l_4x4_vs_lib4:
4067#elif defined(OS_MAC)
4068_inner_store_l_4x4_vs_lib4:
4069#elif defined(OS_WINDOWS)
4070 .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
4071inner_store_l_4x4_vs_lib4:
4072#endif
4073#endif
4074
4075 cmpl $2, %r11d
4076 jg 1f
4077 je 0f
4078
4079 // km==1
4080 movsd %xmm0, 0(%r10)
4081
4082 jmp 3f
4083
40840:
4085 // km==2
4086 cmpl $2, %r12d
4087 movapd %xmm0, 0(%r10)
4088 jl 3f // end
4089 movsd 32(%r10), %xmm15
4090 movsd %xmm15, %xmm1
4091 movapd %xmm1, 32(%r10)
4092
4093 jmp 3f
4094
40951:
4096 cmpl $3, %r11d
4097 jg 2f
4098
4099 // km==3
4100 cmpl $2, %r12d
4101 movapd %xmm0, 0(%r10)
4102 movsd %xmm4, 16(%r10)
4103 jl 3f // end
4104 cmpl $3, %r12d
4105 movsd 32(%r10), %xmm15
4106 movsd %xmm15, %xmm1
4107 movapd %xmm1, 32(%r10)
4108 movsd %xmm5, 48(%r10)
4109 jl 3f // end
4110// movapd %xmm2, 64(%r10)
4111 movsd %xmm6, 80(%r10)
4112
4113 jmp 3f
4114
41152:
4116	// km==4
4117 cmpl $2, %r12d
4118 movapd %xmm0, 0(%r10)
4119 movapd %xmm4, 16(%r10)
4120 jl 3f // end
4121 cmpl $3, %r12d
4122 movsd 32(%r10), %xmm15
4123 movsd %xmm15, %xmm1
4124 movapd %xmm1, 32(%r10)
4125 movapd %xmm5, 48(%r10)
4126 jl 3f // end
4127// movapd %xmm2, 64(%r10)
4128 movapd %xmm6, 80(%r10)
4129 je 3f // end
4130// movapd %xmm3, 96(%r10)
4131 movsd 112(%r10), %xmm15
4132 movsd %xmm15, %xmm7
4133 movapd %xmm7, 112(%r10)
4134
41353:
4136
4137#if MACRO_LEVEL>=1
4138 .endm
4139#else
4140 ret
4141
4142#if defined(OS_LINUX)
4143 .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
4144#endif
4145#endif
4146
4147
4148
4149
4150
4151// rdi rsi rdx rcx r8 r9 rsp+8
4152// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
4153
4154 .p2align 4,,15
4155#if defined(OS_LINUX)
4156 .globl kernel_dgemm_nt_4x4_lib4
4157 .type kernel_dgemm_nt_4x4_lib4, @function
4158kernel_dgemm_nt_4x4_lib4:
4159#elif defined(OS_MAC)
4160 .globl _kernel_dgemm_nt_4x4_lib4
4161_kernel_dgemm_nt_4x4_lib4:
4162#elif defined(OS_WINDOWS)
4163 .globl kernel_dgemm_nt_4x4_lib4
4164 .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
4165kernel_dgemm_nt_4x4_lib4:
4166#endif
4167
4168 PROLOGUE
4169
4170 // zero accumulation registers
4171
4172 xorpd %xmm0, %xmm0
4173 movapd %xmm0, %xmm1
4174 movapd %xmm0, %xmm2
4175 movapd %xmm0, %xmm3
4176 movapd %xmm0, %xmm4
4177 movapd %xmm0, %xmm5
4178 movapd %xmm0, %xmm6
4179 movapd %xmm0, %xmm7
4180
4181
4182 // call inner dgemm kernel nt
4183
4184 movq ARG1, %r10 // k
4185 movq ARG3, %r11 // A
4186 movq ARG4, %r12 // B
4187
4188#if MACRO_LEVEL>=2
4189 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4190#else
4191#if defined(OS_LINUX) | defined(OS_WINDOWS)
4192 call inner_kernel_dgemm_add_nt_4x4_lib4
4193#elif defined(OS_MAC)
4194 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4195#endif
4196#endif
4197
4198
4199 // call inner blend scale
4200
4201 movq ARG2, %r10 // alpha
4202 movq ARG5, %r11 // beta
4203 movq ARG6, %r12 // C
4204
4205#if MACRO_LEVEL>=1
4206 INNER_BLEND_SCALE_AB_4X4_LIB4
4207#else
4208#if defined(OS_LINUX) | defined(OS_WINDOWS)
4209 call inner_blend_scale_ab_4x4_lib4
4210#elif defined(OS_MAC)
4211 callq _inner_blend_scale_ab_4x4_lib4
4212#endif
4213#endif
4214
4215
4216 // store n
4217
4218 movq ARG7, %r10 // D
4219
4220#if MACRO_LEVEL>=1
4221 INNER_STORE_4X4_LIB4
4222#else
4223#if defined(OS_LINUX) | defined(OS_WINDOWS)
4224 call inner_store_4x4_lib4
4225#elif defined(OS_MAC)
4226 callq _inner_store_4x4_lib4
4227#endif
4228#endif
4229
4230
4231 EPILOGUE
4232
4233 ret
4234
4235#if defined(OS_LINUX)
4236 .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
4237#endif
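// C-side usage sketch (hypothetical pointer names; A, B, C and D must
// point to panels packed in lib4 format, 4-row panels interleaved
// column by column):
//
//   double alpha = 1.0, beta = 0.0;
//   kernel_dgemm_nt_4x4_lib4(k, &alpha, Ap, Bp, &beta, Cp, Dp);
//
// computes D = alpha * A * B^T + beta * C on one 4x4 block.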
4238
4239
4240
4241
4242
4243// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
4244// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
4245
4246 .p2align 4,,15
4247#if defined(OS_LINUX)
4248 .globl kernel_dgemm_nt_4x4_vs_lib4
4249 .type kernel_dgemm_nt_4x4_vs_lib4, @function
4250kernel_dgemm_nt_4x4_vs_lib4:
4251#elif defined(OS_MAC)
4252 .globl _kernel_dgemm_nt_4x4_vs_lib4
4253_kernel_dgemm_nt_4x4_vs_lib4:
4254#elif defined(OS_WINDOWS)
4255 .globl kernel_dgemm_nt_4x4_vs_lib4
4256 .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
4257kernel_dgemm_nt_4x4_vs_lib4:
4258#endif
4259
4260 PROLOGUE
4261
4262 // zero accumulation registers
4263
4264 xorpd %xmm0, %xmm0
4265 movapd %xmm0, %xmm1
4266 movapd %xmm0, %xmm2
4267 movapd %xmm0, %xmm3
4268 movapd %xmm0, %xmm4
4269 movapd %xmm0, %xmm5
4270 movapd %xmm0, %xmm6
4271 movapd %xmm0, %xmm7
4272
4273
4274 // call inner dgemm kernel nt
4275
4276 movq ARG1, %r10 // k
4277 movq ARG3, %r11 // A
4278 movq ARG4, %r12 // B
4279
4280#if MACRO_LEVEL>=2
4281 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4282#else
4283#if defined(OS_LINUX) | defined(OS_WINDOWS)
4284 call inner_kernel_dgemm_add_nt_4x4_lib4
4285#elif defined(OS_MAC)
4286 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4287#endif
4288#endif
4289
4290
4291	// call inner blend scale
4292
4293 movq ARG2, %r10 // alpha
4294 movq ARG5, %r11 // beta
4295 movq ARG6, %r12 // C
4296
4297#if MACRO_LEVEL>=1
4298 INNER_BLEND_SCALE_AB_4X4_LIB4
4299#else
4300#if defined(OS_LINUX) | defined(OS_WINDOWS)
4301 call inner_blend_scale_ab_4x4_lib4
4302#elif defined(OS_MAC)
4303 callq _inner_blend_scale_ab_4x4_lib4
4304#endif
4305#endif
4306
4307
4308 // store n
4309
4310 movq ARG7, %r10 // D
4311 movq ARG8, %r11 // km
4312 movq ARG9, %r12 // kn
4313
4314#if MACRO_LEVEL>=1
4315 INNER_STORE_4X4_VS_LIB4
4316#else
4317#if defined(OS_LINUX) | defined(OS_WINDOWS)
4318 call inner_store_4x4_vs_lib4
4319#elif defined(OS_MAC)
4320 callq _inner_store_4x4_vs_lib4
4321#endif
4322#endif
4323
4324
4325 EPILOGUE
4326
4327 ret
4328
4329#if defined(OS_LINUX)
4330 .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
4331#endif
4332
4333
4334
4335
4336
4337#if 0
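// this kernel is compiled out for now, likely because the offset!=0
// branches of inner_store_4x4_gen_lib4 above are still TODO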
4338
4339// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
4340// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
4341
4342 .p2align 4,,15
4343#if defined(OS_LINUX)
4344 .globl kernel_dgemm_nt_4x4_gen_lib4
4345 .type kernel_dgemm_nt_4x4_gen_lib4, @function
4346kernel_dgemm_nt_4x4_gen_lib4:
4347#elif defined(OS_MAC)
4348 .globl _kernel_dgemm_nt_4x4_gen_lib4
4349_kernel_dgemm_nt_4x4_gen_lib4:
4350#elif defined(OS_WINDOWS)
4351 .globl kernel_dgemm_nt_4x4_gen_lib4
4352 .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
4353kernel_dgemm_nt_4x4_gen_lib4:
4354#endif
4355
4356 PROLOGUE
4357
4358 // zero accumulation registers
4359
4360 xorpd %xmm0, %xmm0
4361 movapd %xmm0, %xmm1
4362 movapd %xmm0, %xmm2
4363 movapd %xmm0, %xmm3
4364 movapd %xmm0, %xmm4
4365 movapd %xmm0, %xmm5
4366 movapd %xmm0, %xmm6
4367 movapd %xmm0, %xmm7
4368
4369
4370 // call inner dgemm kernel nt
4371
4372 movq ARG1, %r10 // k
4373 movq ARG3, %r11 // A
4374 movq ARG4, %r12 // B
4375
4376#if MACRO_LEVEL>=2
4377 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4378#else
4379#if defined(OS_LINUX) | defined(OS_WINDOWS)
4380 call inner_kernel_dgemm_add_nt_4x4_lib4
4381#elif defined(OS_MAC)
4382 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4383#endif
4384#endif
4385
4386
4387 // call inner blend scale
4388
4389#if 0 //
4390
4391 movq ARG2, %r10 // alpha
4392 movq ARG5, %r11 // beta
4393 movq ARG6, %r12 // offsetC
4394 movq ARG7, %r13 // C
4395 movq ARG8, %r14 // sdc
4396 sall $5, %r14d // 4*sdc*sizeof(double)
4397
4398#if MACRO_LEVEL>=1
4399 INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
4400#else
4401#if defined(OS_LINUX) | defined(OS_WINDOWS)
4402 call inner_blend_scale_ab_4x4_gen_lib4
4403#elif defined(OS_MAC)
4404 callq _inner_blend_scale_ab_4x4_gen_lib4
4405#endif
4406#endif
4407
4408#else //
4409
4410 movq ARG2, %r10 // alpha
4411 movq ARG5, %r11 // beta
4412 movq ARG7, %r12 // C
4413
4414#if MACRO_LEVEL>=1
4415 INNER_BLEND_SCALE_AB_4X4_LIB4
4416#else
4417#if defined(OS_LINUX) | defined(OS_WINDOWS)
4418 call inner_blend_scale_ab_4x4_lib4
4419#elif defined(OS_MAC)
4420 callq _inner_blend_scale_ab_4x4_lib4
4421#endif
4422#endif
4423
4424#endif //
4425
4426 // store n gen
4427
4428 movq ARG9, %r10 // offsetD
4429 movq ARG10, %r11 // D
4430 movq ARG11, %r12 // sdd
4431	sall $5, %r12d // 4*sdd*sizeof(double)
4432 movq ARG12, %r13 // m0
4433 movq ARG13, %r14 // m1
4434 movq ARG14, %r15 // n0
4435 movq ARG15, %rax // n1
4436
4437#if MACRO_LEVEL>=1
4438 INNER_STORE_4X4_GEN_LIB4
4439#else
4440#if defined(OS_LINUX) | defined(OS_WINDOWS)
4441 call inner_store_4x4_gen_lib4
4442#elif defined(OS_MAC)
4443 callq _inner_store_4x4_gen_lib4
4444#endif
4445#endif
4446
4447
4448 EPILOGUE
4449
4450 ret
4451
4452#if defined(OS_LINUX)
4453 .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
4454#endif
4455
4456#endif
4457
4458
4459
4460
4461
4462// 1 2 3 4 5 6 7 8 9
4463// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
4464
4465 .p2align 4,,15
4466#if defined(OS_LINUX)
4467 .globl kernel_dgemm_nn_4x4_lib4
4468 .type kernel_dgemm_nn_4x4_lib4, @function
4469kernel_dgemm_nn_4x4_lib4:
4470#elif defined(OS_MAC)
4471 .globl _kernel_dgemm_nn_4x4_lib4
4472_kernel_dgemm_nn_4x4_lib4:
4473#elif defined(OS_WINDOWS)
4474 .globl kernel_dgemm_nn_4x4_lib4
4475 .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
4476kernel_dgemm_nn_4x4_lib4:
4477#endif
4478
4479 PROLOGUE
4480
4481 // zero accumulation registers
4482
4483 xorpd %xmm0, %xmm0
4484 movapd %xmm0, %xmm1
4485 movapd %xmm0, %xmm2
4486 movapd %xmm0, %xmm3
4487 movapd %xmm0, %xmm4
4488 movapd %xmm0, %xmm5
4489 movapd %xmm0, %xmm6
4490 movapd %xmm0, %xmm7
4491
4492
4493	// call inner dgemm kernel nn
4494
4495 movq ARG1, %r10 // k
4496 movq ARG3, %r11 // A
4497 movq ARG5, %r12 // B
4498 movq ARG6, %r13 // sdb
4499 sall $5, %r13d // 4*sdb*sizeof(double)
4500 movq ARG4, %r14 // offsetB
4501
4502#if MACRO_LEVEL>=1
4503 INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
4504#else
4505#if defined(OS_LINUX) | defined(OS_WINDOWS)
4506 call inner_edge_dgemm_add_nn_4x4_lib4
4507#elif defined(OS_MAC)
4508 callq _inner_edge_dgemm_add_nn_4x4_lib4
4509#endif
4510#endif
4511
4512#if MACRO_LEVEL>=2
4513 INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
4514#else
4515#if defined(OS_LINUX) | defined(OS_WINDOWS)
4516 call inner_kernel_dgemm_add_nn_4x4_lib4
4517#elif defined(OS_MAC)
4518 callq _inner_kernel_dgemm_add_nn_4x4_lib4
4519#endif
4520#endif
4521
4522
4523 // call inner blend scale
4524
4525 movq ARG2, %r10 // alpha
4526 movq ARG7, %r11 // beta
4527 movq ARG8, %r12 // C
4528
4529#if MACRO_LEVEL>=1
4530 INNER_SCALE_AB_4X4_LIB4
4531#else
4532#if defined(OS_LINUX) | defined(OS_WINDOWS)
4533 call inner_scale_ab_4x4_lib4
4534#elif defined(OS_MAC)
4535 callq _inner_scale_ab_4x4_lib4
4536#endif
4537#endif
4538
4539
4540 // store n
4541
4542 movq ARG9, %r10 // D
4543
4544#if MACRO_LEVEL>=1
4545 INNER_STORE_4X4_LIB4
4546#else
4547#if defined(OS_LINUX) | defined(OS_WINDOWS)
4548 call inner_store_4x4_lib4
4549#elif defined(OS_MAC)
4550 callq _inner_store_4x4_lib4
4551#endif
4552#endif
4553
4554
4555 EPILOGUE
4556
4557 ret
4558
4559#if defined(OS_LINUX)
4560 .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
4561#endif
4562
4563
4564
4565
4566
4567// 1 2 3 4 5 6 7 8 9 10 11
4568// void kernel_dgemm_nn_4x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
4569
4570 .p2align 4,,15
4571#if defined(OS_LINUX)
4572 .globl kernel_dgemm_nn_4x4_vs_lib4
4573 .type kernel_dgemm_nn_4x4_vs_lib4, @function
4574kernel_dgemm_nn_4x4_vs_lib4:
4575#elif defined(OS_MAC)
4576 .globl _kernel_dgemm_nn_4x4_vs_lib4
4577_kernel_dgemm_nn_4x4_vs_lib4:
4578#elif defined(OS_WINDOWS)
4579 .globl kernel_dgemm_nn_4x4_vs_lib4
4580 .def kernel_dgemm_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
4581kernel_dgemm_nn_4x4_vs_lib4:
4582#endif
4583
4584 PROLOGUE
4585
4586 // zero accumulation registers
4587
4588 xorpd %xmm0, %xmm0
4589 movapd %xmm0, %xmm1
4590 movapd %xmm0, %xmm2
4591 movapd %xmm0, %xmm3
4592 movapd %xmm0, %xmm4
4593 movapd %xmm0, %xmm5
4594 movapd %xmm0, %xmm6
4595 movapd %xmm0, %xmm7
4596
4597
4598	// call inner dgemm kernel nn
4599
4600 movq ARG1, %r10 // k
4601 movq ARG3, %r11 // A
4602 movq ARG5, %r12 // B
4603 movq ARG6, %r13 // sdb
4604 sall $5, %r13d // 4*sdb*sizeof(double)
4605 movq ARG4, %r14 // offsetB
4606
4607#if MACRO_LEVEL>=1
4608 INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
4609#else
4610#if defined(OS_LINUX) | defined(OS_WINDOWS)
4611 call inner_edge_dgemm_add_nn_4x4_lib4
4612#elif defined(OS_MAC)
4613 callq _inner_edge_dgemm_add_nn_4x4_lib4
4614#endif
4615#endif
4616
4617#if MACRO_LEVEL>=2
4618 INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
4619#else
4620#if defined(OS_LINUX) | defined(OS_WINDOWS)
4621 call inner_kernel_dgemm_add_nn_4x4_lib4
4622#elif defined(OS_MAC)
4623 callq _inner_kernel_dgemm_add_nn_4x4_lib4
4624#endif
4625#endif
4626
4627
4628 // call inner blend scale
4629
4630 movq ARG2, %r10 // alpha
4631 movq ARG7, %r11 // beta
4632 movq ARG8, %r12 // C
4633
4634#if MACRO_LEVEL>=1
4635 INNER_SCALE_AB_4X4_LIB4
4636#else
4637#if defined(OS_LINUX) | defined(OS_WINDOWS)
4638 call inner_scale_ab_4x4_lib4
4639#elif defined(OS_MAC)
4640 callq _inner_scale_ab_4x4_lib4
4641#endif
4642#endif
4643
4644
4645 // store n
4646
4647 movq ARG9, %r10 // D
4648 movq ARG10, %r11 // km
4649 movq ARG11, %r12 // kn
4650
4651#if MACRO_LEVEL>=1
4652 INNER_STORE_4X4_VS_LIB4
4653#else
4654#if defined(OS_LINUX) | defined(OS_WINDOWS)
4655 call inner_store_4x4_vs_lib4
4656#elif defined(OS_MAC)
4657 callq _inner_store_4x4_vs_lib4
4658#endif
4659#endif
4660
4661
4662 EPILOGUE
4663
4664 ret
4665
4666#if defined(OS_LINUX)
4667 .size kernel_dgemm_nn_4x4_vs_lib4, .-kernel_dgemm_nn_4x4_vs_lib4
4668#endif
4669
4670
4671
4672
4673
4674// rdi rsi rdx rcx r8 r9 rsp+8
4675// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
4676
4677 .p2align 4,,15
4678#if defined(OS_LINUX)
4679 .globl kernel_dsyrk_nt_l_4x4_lib4
4680 .type kernel_dsyrk_nt_l_4x4_lib4, @function
4681kernel_dsyrk_nt_l_4x4_lib4:
4682#elif defined(OS_MAC)
4683 .globl _kernel_dsyrk_nt_l_4x4_lib4
4684_kernel_dsyrk_nt_l_4x4_lib4:
4685#elif defined(OS_WINDOWS)
4686 .globl kernel_dsyrk_nt_l_4x4_lib4
4687 .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
4688kernel_dsyrk_nt_l_4x4_lib4:
4689#endif
4690
4691 PROLOGUE
4692
4693 // zero accumulation registers
4694
4695 xorpd %xmm0, %xmm0
4696 movapd %xmm0, %xmm1
4697 movapd %xmm0, %xmm2
4698 movapd %xmm0, %xmm3
4699 movapd %xmm0, %xmm4
4700 movapd %xmm0, %xmm5
4701 movapd %xmm0, %xmm6
4702 movapd %xmm0, %xmm7
4703
4704
4705 // call inner dgemm kernel nt
4706
4707 movq ARG1, %r10 // k
4708 movq ARG3, %r11 // A
4709 movq ARG4, %r12 // B
4710
4711#if MACRO_LEVEL>=2
4712 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4713#else
4714#if defined(OS_LINUX) | defined(OS_WINDOWS)
4715 call inner_kernel_dgemm_add_nt_4x4_lib4
4716#elif defined(OS_MAC)
4717 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4718#endif
4719#endif
4720
4721
4722	// call inner blend scale
4723
4724 movq ARG2, %r10 // alpha
4725 movq ARG5, %r11 // beta
4726 movq ARG6, %r12 // C
4727
4728#if MACRO_LEVEL>=1
4729 INNER_BLEND_SCALE_AB_4X4_LIB4
4730#else
4731#if defined(OS_LINUX) | defined(OS_WINDOWS)
4732 call inner_blend_scale_ab_4x4_lib4
4733#elif defined(OS_MAC)
4734 callq _inner_blend_scale_ab_4x4_lib4
4735#endif
4736#endif
4737
4738
4739 // store n
4740
4741 movq ARG7, %r10 // D
4742
4743
4744#if MACRO_LEVEL>=1
4745 INNER_STORE_L_4X4_LIB4
4746#else
4747#if defined(OS_LINUX) | defined(OS_WINDOWS)
4748 call inner_store_l_4x4_lib4
4749#elif defined(OS_MAC)
4750 callq _inner_store_l_4x4_lib4
4751#endif
4752#endif
4753
4754
4755
4756 EPILOGUE
4757
4758 ret
4759
4760#if defined(OS_LINUX)
4761 .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
4762#endif
4763
4764
4765
4766
4767
4768// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
4769// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
4770
4771 .p2align 4,,15
4772#if defined(OS_LINUX)
4773 .globl kernel_dsyrk_nt_l_4x4_vs_lib4
4774 .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
4775kernel_dsyrk_nt_l_4x4_vs_lib4:
4776#elif defined(OS_MAC)
4777 .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
4778_kernel_dsyrk_nt_l_4x4_vs_lib4:
4779#elif defined(OS_WINDOWS)
4780 .globl kernel_dsyrk_nt_l_4x4_vs_lib4
4781 .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
4782kernel_dsyrk_nt_l_4x4_vs_lib4:
4783#endif
4784
4785 PROLOGUE
4786
4787 // zero accumulation registers
4788
4789 xorpd %xmm0, %xmm0
4790 movapd %xmm0, %xmm1
4791 movapd %xmm0, %xmm2
4792 movapd %xmm0, %xmm3
4793 movapd %xmm0, %xmm4
4794 movapd %xmm0, %xmm5
4795 movapd %xmm0, %xmm6
4796 movapd %xmm0, %xmm7
4797
4798
4799 // call inner dgemm kernel nt
4800
4801 movq ARG1, %r10 // k
4802 movq ARG3, %r11 // A
4803 movq ARG4, %r12 // B
4804
4805#if MACRO_LEVEL>=2
4806 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4807#else
4808#if defined(OS_LINUX) | defined(OS_WINDOWS)
4809 call inner_kernel_dgemm_add_nt_4x4_lib4
4810#elif defined(OS_MAC)
4811 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4812#endif
4813#endif
4814
4815
4816	// call inner blend scale
4817
4818 movq ARG2, %r10 // alpha
4819 movq ARG5, %r11 // beta
4820 movq ARG6, %r12 // C
4821
4822#if MACRO_LEVEL>=1
4823 INNER_BLEND_SCALE_AB_4X4_LIB4
4824#else
4825#if defined(OS_LINUX) | defined(OS_WINDOWS)
4826 call inner_blend_scale_ab_4x4_lib4
4827#elif defined(OS_MAC)
4828 callq _inner_blend_scale_ab_4x4_lib4
4829#endif
4830#endif
4831
4832
4833 // store n
4834
4835 movq ARG7, %r10 // D
4836 movq ARG8, %r11 // km
4837 movq ARG9, %r12 // kn
4838
4839
4840#if MACRO_LEVEL>=1
4841 INNER_STORE_L_4X4_VS_LIB4
4842#else
4843#if defined(OS_LINUX) | defined(OS_WINDOWS)
4844 call inner_store_l_4x4_vs_lib4
4845#elif defined(OS_MAC)
4846 callq _inner_store_l_4x4_vs_lib4
4847#endif
4848#endif
4849
4850
4851
4852 EPILOGUE
4853
4854 ret
4855
4856#if defined(OS_LINUX)
4857 .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
4858#endif
4859
4860
4861
4862
4863
4864// rdi rsi rdx rcx r8 r9 rsp+8
4865// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
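//
// B is upper triangular, so the first 4 columns of the product are
// handled by the dtrmm edge routine, while the general nt kernel starts
// from A+4*bs and B+4*bs with a count of k-4 (for k <= 4 the count is
// nonpositive and the inner kernel is presumably a no-op).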
4866
4867 .p2align 4,,15
4868#if defined(OS_LINUX)
4869 .globl kernel_dtrmm_nt_ru_4x4_lib4
4870 .type kernel_dtrmm_nt_ru_4x4_lib4, @function
4871kernel_dtrmm_nt_ru_4x4_lib4:
4872#elif defined(OS_MAC)
4873 .globl _kernel_dtrmm_nt_ru_4x4_lib4
4874_kernel_dtrmm_nt_ru_4x4_lib4:
4875#elif defined(OS_WINDOWS)
4876 .globl kernel_dtrmm_nt_ru_4x4_lib4
4877 .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
4878kernel_dtrmm_nt_ru_4x4_lib4:
4879#endif
4880
4881 PROLOGUE
4882
4883 // zero accumulation registers
4884
4885 xorpd %xmm0, %xmm0
4886 movapd %xmm0, %xmm1
4887 movapd %xmm0, %xmm2
4888 movapd %xmm0, %xmm3
4889 movapd %xmm0, %xmm4
4890 movapd %xmm0, %xmm5
4891 movapd %xmm0, %xmm6
4892 movapd %xmm0, %xmm7
4893
4894
4895 // call inner dgemm kernel nt after initial triangle
4896
4897 movq ARG1, %r10 // k
4898 subl $4, %r10d // k-4
4899 movq ARG3, %r11 // A
4900 addq $128, %r11 // A+4*bs
4901 movq ARG4, %r12 // B
4902 addq $128, %r12 // B+4*bs
4903
4904#if MACRO_LEVEL>=2
4905 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
4906#else
4907#if defined(OS_LINUX) | defined(OS_WINDOWS)
4908 call inner_kernel_dgemm_add_nt_4x4_lib4
4909#elif defined(OS_MAC)
4910 callq _inner_kernel_dgemm_add_nt_4x4_lib4
4911#endif
4912#endif
4913
4914
4915 // call inner blend
4916
4917#if MACRO_LEVEL>=1
4918 INNER_BLEND_4X4_LIB4
4919#else
4920#if defined(OS_LINUX) | defined(OS_WINDOWS)
4921 call inner_blend_4x4_lib4
4922#elif defined(OS_MAC)
4923 callq _inner_blend_4x4_lib4
4924#endif
4925#endif
4926
4927
4928 // initial triangle
4929
4930 movq ARG3, %r10
4931 movq ARG4, %r11
4932
4933#if MACRO_LEVEL>=1
4934 INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
4935#else
4936#if defined(OS_LINUX) | defined(OS_WINDOWS)
4937 call inner_edge_dtrmm_nt_ru_4x4_lib4
4938#elif defined(OS_MAC)
4939 callq _inner_edge_dtrmm_nt_ru_4x4_lib4
4940#endif
4941#endif
4942
4943
4944 // call inner scale
4945
4946 movq ARG2, %r10 // alpha
4947 movq ARG5, %r11 // beta
4948 movq ARG6, %r12 // C
4949
4950#if MACRO_LEVEL>=1
4951 INNER_SCALE_AB_4X4_LIB4
4952#else
4953#if defined(OS_LINUX) | defined(OS_WINDOWS)
4954 call inner_scale_ab_4x4_lib4
4955#elif defined(OS_MAC)
4956 callq _inner_scale_ab_4x4_lib4
4957#endif
4958#endif
4959
4960
4961 // store n
4962
4963 movq ARG7, %r10 // D
4964
4965#if MACRO_LEVEL>=1
4966 INNER_STORE_4X4_LIB4
4967#else
4968#if defined(OS_LINUX) | defined(OS_WINDOWS)
4969 call inner_store_4x4_lib4
4970#elif defined(OS_MAC)
4971 callq _inner_store_4x4_lib4
4972#endif
4973#endif
4974
4975
4976 EPILOGUE
4977
4978 ret
4979
4980#if defined(OS_LINUX)
4981 .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
4982#endif
4983
4984
4985
4986
4987
4988// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
4989// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
4990
4991 .p2align 4,,15
4992#if defined(OS_LINUX)
4993 .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
4994 .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
4995kernel_dtrmm_nt_ru_4x4_vs_lib4:
4996#elif defined(OS_MAC)
4997 .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
4998_kernel_dtrmm_nt_ru_4x4_vs_lib4:
4999#elif defined(OS_WINDOWS)
5000 .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
5001 .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
5002kernel_dtrmm_nt_ru_4x4_vs_lib4:
5003#endif
5004
5005 PROLOGUE
5006
5007 // zero accumulation registers
5008
5009 xorpd %xmm0, %xmm0
5010 movapd %xmm0, %xmm1
5011 movapd %xmm0, %xmm2
5012 movapd %xmm0, %xmm3
5013 movapd %xmm0, %xmm4
5014 movapd %xmm0, %xmm5
5015 movapd %xmm0, %xmm6
5016 movapd %xmm0, %xmm7
5017
5018
5019 // call inner dgemm kernel nt after initial triangle
5020
5021 movq ARG1, %r10 // k
5022 subl $4, %r10d // k-4
5023 movq ARG3, %r11 // A
5024 addq $128, %r11 // A+4*bs
5025 movq ARG4, %r12 // B
5026 addq $128, %r12 // B+4*bs
5027
5028#if MACRO_LEVEL>=2
5029 INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
5030#else
5031#if defined(OS_LINUX) | defined(OS_WINDOWS)
5032 call inner_kernel_dgemm_add_nt_4x4_lib4
5033#elif defined(OS_MAC)
5034 callq _inner_kernel_dgemm_add_nt_4x4_lib4
5035#endif
5036#endif
5037
5038
5039	// call inner blend
5040
5041#if MACRO_LEVEL>=1
5042 INNER_BLEND_4X4_LIB4
5043#else
5044#if defined(OS_LINUX) | defined(OS_WINDOWS)
5045 call inner_blend_4x4_lib4
5046#elif defined(OS_MAC)
5047 callq _inner_blend_4x4_lib4
5048#endif
5049#endif
5050
5051
5052 // initial triangle
5053
5054 movq ARG1, %r10 // k
5055 movq ARG3, %r11 // A
5056 movq ARG4, %r12 // B
5057
5058#if MACRO_LEVEL>=1
5059 INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
5060#else
5061#if defined(OS_LINUX) | defined(OS_WINDOWS)
5062 call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
5063#elif defined(OS_MAC)
5064 callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
5065#endif
5066#endif
5067
5068
5069 // call inner loader nn
5070
5071 movq ARG2, %r10 // alpha
5072 movq ARG5, %r11 // beta
5073 movq ARG6, %r12 // C
5074
5075#if MACRO_LEVEL>=1
5076 INNER_SCALE_AB_4X4_LIB4
5077#else
5078#if defined(OS_LINUX) | defined(OS_WINDOWS)
5079 call inner_scale_ab_4x4_lib4
5080#elif defined(OS_MAC)
5081 callq _inner_scale_ab_4x4_lib4
5082#endif
5083#endif
5084
5085
5086 // store n
5087
5088 movq ARG7, %r10 // D
5089 movq ARG8, %r11 // km
5090 movq ARG9, %r12 // kn
5091
5092#if MACRO_LEVEL>=1
5093 INNER_STORE_4X4_VS_LIB4
5094#else
5095#if defined(OS_LINUX) | defined(OS_WINDOWS)
5096 call inner_store_4x4_vs_lib4
5097#elif defined(OS_MAC)
5098 callq _inner_store_4x4_vs_lib4
5099#endif
5100#endif
5101
5102
5103 EPILOGUE
5104
5105 ret
5106
5107#if defined(OS_LINUX)
5108 .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
5109#endif
5110
5111
5112
5113
5114
5115// edi rsi rdx ecx r8 r9
5116// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
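// Cholesky factorization: D = chol(C - A * B', lower), storing the lower
// triangle only; the reciprocals of the diagonal of D are written to
// inv_diag_D for reuse by the companion dtrsm kernels.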

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dpotrf_nt_l_4x4_lib4
	.type kernel_dpotrf_nt_l_4x4_lib4, @function
kernel_dpotrf_nt_l_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dpotrf_nt_l_4x4_lib4
_kernel_dpotrf_nt_l_4x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dpotrf_nt_l_4x4_lib4
	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
kernel_dpotrf_nt_l_4x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt

	movq ARG1, %r10
	movq ARG2, %r11
	movq ARG3, %r12

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG4, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// factorization

	movq ARG6, %r10 // inv_diag_D
	movl $4, %r11d // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG5, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_4x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
#endif





// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
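// Variable-size variant: same factorization, with the store clipped to
// km x kn and the factorization itself bounded to kn columns.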

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
kernel_dpotrf_nt_l_4x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
_kernel_dpotrf_nt_l_4x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dpotrf_nt_l_4x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt

	movq ARG1, %r10
	movq ARG2, %r11
	movq ARG3, %r12

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG4, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// factorization

	movq ARG6, %r10 // inv_diag_D
	movq ARG8, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG5, %r10 // D
	movq ARG7, %r11 // km
	movq ARG8, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_4x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
#endif





// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
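// Fused syrk + potrf: D = chol(C + Ap * Bp' - Am * Bm', lower), i.e. the
// add and sub products are accumulated in registers before the
// factorization, saving an intermediate store of the syrk result.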

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // Bp

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_4x4_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG4, %r10 // km
	movq ARG5, %r11 // Am
	movq ARG6, %r12 // Bm

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG7, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// factorization

	movq ARG9, %r10 // inv_diag_D
	movl $4, %r11d

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG8, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_4x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
#endif





// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
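// Variable-size variant of the fused kernel above: the trailing km/kn
// pair clips the stored block and bounds the partial factorization.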

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // Bp

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_4x4_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG4, %r10 // km
	movq ARG5, %r11 // Am
	movq ARG6, %r12 // Bm

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG7, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// factorization

	movq ARG9, %r10 // inv_diag_D
	movq ARG11, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DPOTRF_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dpotrf_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dpotrf_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG8, %r10 // D
	movq ARG10, %r11 // km
	movq ARG11, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_l_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_l_4x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
#endif





// edi rsi rdx rcx r8 r9 rsp+8
// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
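// Triangular solve: D = (C - A * B') * E^-T, with E lower triangular;
// divisions by the diagonal of E are replaced by multiplications with the
// precomputed reciprocals in inv_diag_E.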

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
kernel_dtrsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
_kernel_dtrsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_4x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt

	movq ARG1, %r10
	movq ARG2, %r11
	movq ARG3, %r12

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG4, %r10

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// solve

	movq ARG6, %r10 // E
	movq ARG7, %r11 // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
#endif
#endif


	// store

	movq ARG5, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
#endif





// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
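// Fused gemm + trsm: D = (C + Ap * Bp' - Am * Bm') * E^-T, downdate and
// solve in a single kernel call.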

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // Bp

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_4x4_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG4, %r10 // km
	movq ARG5, %r11 // Am
	movq ARG6, %r12 // Bm

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG7, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// solve

	movq ARG9, %r10 // E
	movq ARG10, %r11 // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
#endif
#endif


	// store

	movq ARG8, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
#endif





// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
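// Variable-size dtrsm: km/kn clip the stored block and the solve runs on
// kn columns only. An illustrative C call for an m x n tail block with
// m, n <= 4 (names as in the prototype above):
//
//   kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, m, n);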

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt

	movq ARG1, %r10
	movq ARG2, %r11
	movq ARG3, %r12

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader // TODO scale gen

	movq ARG4, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// solve

	movq ARG6, %r10 // E
	movq ARG7, %r11 // inv_diag_E
	movq ARG9, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG5, %r10 // D
	movq ARG8, %r11 // km
	movq ARG9, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
#endif





// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
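// Fused gemm + trsm, variable-size: combines the two kernels above,
// clipping the store to km x kn and bounding the solve to kn columns.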

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// call inner dgemm kernel nt add

	movq ARG1, %r10 // kp
	movq ARG2, %r11 // Ap
	movq ARG3, %r12 // Bp

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nt_4x4_lib4
#endif
#endif


	// call inner dgemm kernel nt sub

	movq ARG4, %r10 // km
	movq ARG5, %r11 // Am
	movq ARG6, %r12 // Bm

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_sub_nt_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
#endif
#endif


	// call inner blender_loader

	movq ARG7, %r10 // C

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_11_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_blend_scale_11_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_blend_scale_11_4x4_lib4
#endif
#endif


	// solve

	movq ARG9, %r10 // E
	movq ARG10, %r11 // inv_diag_E
	movq ARG12, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
#endif
#endif


	// store

	movq ARG8, %r10 // D
	movq ARG11, %r11 // km
	movq ARG12, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
#endif





// 1 2 3 4 5 6 7
// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
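// D = alpha * A * B with B lower triangular, B in panel-major (lib4)
// storage: offsetB is the row offset into the first 4-wide panel of B and
// sdb the panel stride, hence the sall $5 (4*sdb*sizeof(double)) below.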

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dtrmm_nn_rl_4x4_lib4
	.type kernel_dtrmm_nn_rl_4x4_lib4, @function
kernel_dtrmm_nn_rl_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dtrmm_nn_rl_4x4_lib4
_kernel_dtrmm_nn_rl_4x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dtrmm_nn_rl_4x4_lib4
	.def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
kernel_dtrmm_nn_rl_4x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	xorpd %xmm0, %xmm0
	movapd %xmm0, %xmm1
	movapd %xmm0, %xmm2
	movapd %xmm0, %xmm3
	movapd %xmm0, %xmm4
	movapd %xmm0, %xmm5
	movapd %xmm0, %xmm6
	movapd %xmm0, %xmm7


	// initial triangle

	movq ARG1, %r10 // k
	movq ARG3, %r11 // A
	movq ARG5, %r12 // B
	movq ARG6, %r13 // sdb
	sall $5, %r13d // 4*sdb*sizeof(double)
	movq ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dtrmm_nn_rl_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dtrmm_nn_rl_4x4_lib4
#endif
#endif


	// call inner dgemm kernel nn after initial triangle

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x4_lib4
#endif
#endif


	// call inner scale

	movq ARG2, %r10 // alpha

#if MACRO_LEVEL>=1
	INNER_SCALE_A0_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_a0_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_scale_a0_4x4_lib4
#endif
#endif


	// store n

	movq ARG7, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
#endif





	// read-only data
#if defined(OS_LINUX)
	.section .rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section __TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif
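// Each pair of .long values below encodes one IEEE-754 double, low word
// first (little-endian): e.g. { .long 0, .long 1072693248 } is
// 0x3FF0000000000000 = 1.0.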

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
	.align 5
LC00: // { -1 -1 -1 1 }
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad 1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
	.align 5
LC01: // { -1 -1 -1 -1 }
#endif
	.quad -1
	.quad -1
	.quad -1
	.quad -1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC02: // { 3.5 2.5 1.5 0.5 }
#endif
	.long 0
	.long 1071644672
	.long 0
	.long 1073217536
	.long 0
	.long 1074003968
	.long 0
	.long 1074528256

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
	.align 5
LC03: // { 7.5 6.5 5.5 4.5 }
#endif
	.long 0
	.long 1074921472
	.long 0
	.long 1075183616
	.long 0
	.long 1075445760
	.long 0
	.long 1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC04: // { 1.0 1.0 1.0 1.0 }
#endif
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248
	.long 0
	.long 1072693248



#if defined(OS_LINUX)
	.section .note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
