Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/avx/Makefile b/kernel/avx/Makefile
new file mode 100644
index 0000000..f260086
--- /dev/null
+++ b/kernel/avx/Makefile
@@ -0,0 +1,54 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_12_lib4.o kernel_dgemv_8_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o kernel_dgebp_lib4.o
+OBJS += kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/avx/kernel_dgebp_lib4.S b/kernel/avx/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..0e8581e
--- /dev/null
+++ b/kernel/avx/kernel_dgebp_lib4.S
@@ -0,0 +1,935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+//                               1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
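+//
+// Roughly: C[0:8,0:k] -= A[0:8,0:4] * B[0:4,0:k], where A and C are stored as two
+// 4-row panels (lib4 panel-major format, bs=4) with panel strides sda and sdc.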
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_lib4
+ .type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_lib4
+ .def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3        4          5          6        7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
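+//
+// Variable-size variant of kernel_dger4_sub_8r_lib4: the rows of the second panel
+// of A at or beyond km are zeroed on load (vmaskmovpd), so the corresponding rows
+// of C are left numerically unchanged.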
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC01(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmaskmovpd 0(%r11, %r12, 1), %ymm15, %ymm4
+ vmaskmovpd 32(%r11, %r12, 1), %ymm15, %ymm5
+ vmaskmovpd 64(%r11, %r12, 1), %ymm15, %ymm6
+ vmaskmovpd 96(%r11, %r12, 1), %ymm15, %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
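+//
+// Roughly: C[0:4,0:n] -= A[0:4,0:4] * B[0:4,0:n], with all matrices in lib4
+// panel-major format (bs=4).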
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_lib4
+ .type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_lib4
+ .def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3          4          5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
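+//
+// Variable-size variant of kernel_dger4_sub_4r_lib4: rows of A at or beyond km
+// are zeroed on load (vmaskmovpd), so the corresponding rows of C are left
+// numerically unchanged.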
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+	movq	ARG5, %r14 // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC00(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmaskmovpd 0(%r11), %ymm15, %ymm0
+ vmaskmovpd 32(%r11), %ymm15, %ymm1
+ vmaskmovpd 64(%r11), %ymm15, %ymm2
+ vmaskmovpd 96(%r11), %ymm15, %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+ .align 5
+#endif
+ .double 0.5
+ .double 1.5
+ .double 2.5
+ .double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double 4.5
+ .double 5.5
+ .double 6.5
+ .double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 8.5
+ .double 9.5
+ .double 10.5
+ .double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_dgemm_4x4_lib4.S b/kernel/avx/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..95ff6ea
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9906 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
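+//
+// The routine accumulates the k rank-1 updates A(:,l) * B(:,l)^T (l = 0..k-1),
+// unrolled 4 at a time; the 4x4 result is kept in the permuted (diagonal-wise)
+// accumulator layout shown above.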
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+
+// cmpl $3, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ subl $1, %r10d
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
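+//
+// Same loop structure as inner_kernel_dgemm_add_nt_4x4_lib4 above, but the
+// rank-1 updates A(:,l) * B(:,l)^T are subtracted from the accumulators.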
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ cmpl $4, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+// cmpl $3, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
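+//
+// Here B is not transposed: each step broadcasts the four entries of one row of B
+// (the columns lie 32 bytes apart within the panel) and accumulates A(:,l) * B(l,0:4);
+// r13 advances B to the next 4-row panel every 4 iterations.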
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
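+//
+// Same loop structure as inner_kernel_dgemm_add_nn_4x4_lib4 above, but the
+// rank-1 updates A(:,l) * B(l,0:4) are subtracted from the accumulators.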
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
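+//
+// For each of the k columns: C(:,j) += ymm0*B(0,j) + ymm1*B(1,j) + ymm2*B(2,j) + ymm3*B(3,j),
+// i.e. C[0:4,0:k] += A[0:4,0:4] * B[0:4,0:k] with the 4x4 block A preloaded in ymm0..ymm3.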
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 32(%r12)
+
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 64(%r12)
+
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // cleanup loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
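+//
+// rough scalar reference (illustrative, assuming bs=4 panel-major storage):
+// the first kend = min(k, 4-offB) iterations use the columns left in the
+// current row panel of B, one at a time,
+//
+//   for(ll=0; ll<kend; ll++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[offB+ll+4*jj];
+//
+// and, if k>kend, B is then advanced to the start of the next row panel.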
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
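+//
+// rough scalar reference (illustrative, assuming bs=4 panel-major storage):
+// the 4x4 upper-triangular block of B enters the nt product through its
+// upper entries only,
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ll=jj; ll<4; ll++)       // B[jj+4*ll]==0 for ll<jj is skipped
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[jj+4*ll];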
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
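+//
+// rough scalar reference (illustrative, assuming bs=4 panel-major storage and
+// offB=0; the offB>0 branches below do the same work with B starting part-way
+// down its panel, possibly crossing into the next panel):
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ll=jj; ll<4; ll++)       // B lower triangular: B[ll+4*jj]==0 for ll<jj
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[ll+4*jj];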
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r14d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r14d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r14d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
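+//
+// rough scalar reference (illustrative): the first four columns of A are
+// masked to their upper-triangular part (vblendpd against a zeroed register)
+// before the usual nt multiply-accumulate, so column ll of A contributes only
+// rows 0..ll:
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ll=jj; ll<4; ll++)
+//       for(ii=0; ii<=ll; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[jj+4*ll];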
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r11
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
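+//
+// note (illustrative): no arithmetic here; the nt kernels accumulate the 4x4
+// result with lanes rotated inside each register, and the two vblendpd rounds
+// below undo that: the first round (masks 0xa/0x5) exchanges the odd lanes
+// between register pairs, the second (masks 0xc/0x3) exchanges their 128-bit
+// halves, leaving plain columns in ymm0..ymm3.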
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
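+//
+// rough scalar reference (illustrative): with alpha at 0(%r10), beta at
+// 0(%r11) and C at %r12,
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<4; ii++)
+//       D[ii+4*jj] = alpha*D[ii+4*jj] + beta*C[ii+4*jj];
+//
+// the beta==0.0 test lets the routine skip loading C entirely in that case.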
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
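+//
+// note (illustrative): same D = alpha*D + beta*C update as above, but C may
+// start at a row offset inside its panel; for offset>0 each 4-element load is
+// assembled from the bottom rows of the C0 panel and the top rows of the
+// C1 = C0 + 4*sdc*sizeof(double) panel (vblendpd), then rotated back into row
+// order with vperm2f128/vshufpd before the beta multiply.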
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
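+//
+// rough scalar reference (illustrative): right-looking Cholesky on the 4x4
+// accumulator held in ymm0..ymm3; the reciprocals of the diagonal are stored
+// to inv_diag_E, and a non-positive pivot is replaced by 0.0:
+//
+//   for(jj=0; jj<4; jj++) {
+//     tmp = D[jj+4*jj];
+//     tmp = (tmp>0.0) ? 1.0/sqrt(tmp) : 0.0;
+//     inv_diag_E[jj] = tmp;
+//     for(ii=0; ii<4; ii++) D[ii+4*jj] *= tmp;          // scale column jj
+//     for(ll=jj+1; ll<4; ll++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*ll] -= D[ii+4*jj] * D[ll+4*jj];        // update trailing columns
+//   }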
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_lib4, @function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
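+// reference only: a rough scalar C sketch of this solve, D := D * E^{-T} with
+// E lower triangular stored panel-major (element (i,j) at E[i+4*j]) and
+// inv_diag_E[j] = 1.0/E[j][j]; acc[j][i] is a hypothetical name for lane i of
+// ymm0..ymm3 (column j):
+//
+//   for(int j=0; j<4; j++) {
+//       for(int i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//       for(int k=j+1; k<4; k++)
+//           for(int i=0; i<4; i++) acc[k][i] -= acc[j][i] * E[k+4*j];
+//   }
+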
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
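+// reference only: same right/lower/transposed solve as
+// inner_edge_dtrsm_rlt_inv_4x4_lib4, but E has an implicit unit diagonal, so
+// the columns are never scaled; rough scalar C sketch (element (i,j) of E at
+// E[i+4*j], acc[j][i] a hypothetical name for lane i of ymm0..ymm3):
+//
+//   for(int j=0; j<4; j++)
+//       for(int k=j+1; k<4; k++)
+//           for(int i=0; i<4; i++) acc[k][i] -= acc[j][i] * E[k+4*j];
+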
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
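+// reference only: a rough scalar C sketch of this solve, D := D * E^{-T} with
+// E upper triangular (element (i,j) at E[i+4*j]), processed from the last
+// column backwards; acc[j][i] is a hypothetical name for lane i of ymm0..ymm3:
+//
+//   for(int j=3; j>=0; j--) {
+//       for(int i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//       for(int k=0; k<j; k++)
+//           for(int i=0; i<4; i++) acc[k][i] -= acc[j][i] * E[k+4*j];
+//   }
+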
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
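+// reference only: a rough scalar C sketch of this solve, D := D * E^{-1} with
+// E upper triangular (element (i,j) at E[i+4*j]), one column at a time;
+// acc[j][i] is a hypothetical name for lane i of ymm0..ymm3:
+//
+//   for(int j=0; j<4; j++) {
+//       for(int k=0; k<j; k++)
+//           for(int i=0; i<4; i++) acc[j][i] -= acc[k][i] * E[k+4*j];
+//       for(int i=0; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//   }
+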
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
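+// reference only: a rough scalar C sketch of this solve, D := E^{-1} * D with
+// E unit lower triangular (element (i,j) at E[i+4*j]), i.e. forward
+// substitution over the rows; acc[j][i] is a hypothetical name for lane i of
+// ymm0..ymm3:
+//
+//   for(int k=0; k<3; k++)
+//       for(int i=k+1; i<4; i++)
+//           for(int j=0; j<4; j++) acc[j][i] -= E[i+4*k] * acc[j][k];
+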
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
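+// reference only: a rough scalar C sketch of this solve, D := E^{-1} * D with
+// E upper triangular (element (i,j) at E[i+4*j]), i.e. backward substitution
+// over the rows; acc[j][i] is a hypothetical name for lane i of ymm0..ymm3:
+//
+//   for(int i=3; i>=0; i--)
+//       for(int j=0; j<4; j++) {
+//           acc[j][i] *= inv_diag_E[i];
+//           for(int k=0; k<i; k++) acc[j][k] -= E[k+4*i] * acc[j][i];
+//       }
+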
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
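+// reference only: a rough scalar C sketch of this in-register LU (Doolittle,
+// no pivoting), which leaves U on and above the diagonal and the unit-lower L
+// multipliers below it; acc[j][i] is a hypothetical name for lane i of
+// ymm0..ymm3 (column j):
+//
+//   for(int j=0; j<4; j++) {
+//       for(int k=0; k<j; k++)
+//           for(int i=k+1; i<4; i++) acc[j][i] -= acc[k][i] * acc[j][k];
+//       inv_diag_E[j] = 1.0 / acc[j][j];
+//       for(int i=j+1; i<4; i++) acc[j][i] *= inv_diag_E[j];
+//   }
+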
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+ vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vmovddup %xmm0, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vmovddup %xmm1, %xmm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vmovddup %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
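+// reference only: the row mask is built by broadcasting (double)km and
+// subtracting it from the four per-lane constants at .LC02 (presumably
+// {0.5, 1.5, 2.5, 3.5}), so the sign bit of lane i ends up set exactly when
+// i < km and vmaskmovpd writes only those rows; the kn compares cut off the
+// trailing columns.  Rough scalar C sketch (element (i,j) of D at D[i+4*j],
+// acc[j][i] a hypothetical name for lane i of ymm0..ymm3):
+//
+//   for(int j=0; j<kn && j<4; j++)
+//       for(int i=0; i<km && i<4; i++) D[i+4*j] = acc[j][i];
+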
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r12d
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ jl 0f // end
+ cmpl $3, %r12d
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ jl 0f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
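+// reference only: a rough scalar C sketch of the generalized store, with a
+// row window [m0,m1), a column window [n0,n1), and a row offset that spills
+// the tail of each column into the next panel at D + 4*sdd doubles;
+// acc[j][i] is a hypothetical name for lane i of ymm0..ymm3:
+//
+//   for(int j=n0; j<n1 && j<4; j++)
+//       for(int i=m0; i<m1 && i<4; i++) {
+//           int ii = i + offset; // row position once the offset is applied
+//           double *d = ii<4 ? D+ii+4*j : D+4*sdd+(ii-4)+4*j;
+//           *d = acc[j][i];
+//       }
+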
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vblendpd $0x4, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x2, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
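+// reference only: a rough scalar C equivalent of this kernel on one 4x4
+// block, with A and B stored panel-major (element (i,l) of a 4xk panel at
+// A[i+4*l]) and "nt" meaning B enters transposed:
+//
+//   for(int j=0; j<4; j++)
+//       for(int i=0; i<4; i++) {
+//           double c = 0.0;
+//           for(int l=0; l<k; l++) c += A[i+4*l] * B[j+4*l];
+//           D[i+4*j] = alpha[0]*c + beta[0]*C[i+4*j];
+//       }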
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
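+// Note: the dsyrk "nt_l" kernels compute a 4x4 tile of D = beta*C + alpha*A*B^T and,
+// judging by the INNER_STORE_L_* routines they end with, store only the lower
+// triangular part of that tile, leaving the strictly upper part of D untouched.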
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
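+// Note: in the "_vs" (variable size) kernels the extra km/kn arguments are forwarded
+// to the INNER_STORE_*_VS_* routines, so presumably only km rows and kn columns of
+// the 4x4 tile are written back; this is how edge tiles of matrices whose sizes are
+// not multiples of 4 appear to be handled.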
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
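+// Note: the dtrmm "nn_rl" kernels multiply by a right, lower triangular B: an edge
+// routine (inner_edge_dtrmm_nn_rl_*) covers the triangular leading columns, the plain
+// nn dgemm inner kernel covers the remaining ones, and the result is scaled by alpha
+// only (inner_scale_a0), i.e. there is no beta*C term in this operation.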
+// 1 2 3 4 5 6 7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
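+// Note: the dpotrf kernels below first subtract A*B^T from C (dgemm sub nt), then
+// factorize the resulting 4x4 block in place (inner_edge_dpotrf_4x4_lib4) and store
+// its lower triangle in D. The name inv_diag_D suggests that the reciprocals of the
+// diagonal of the Cholesky factor are written there for use in later triangular solves.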
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
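+// Note: the fused dsyrk_dpotrf kernels run two inner dgemm passes on the same
+// accumulation registers -- an add pass over kp columns of Ap/Bp and a sub pass over
+// km columns of Am/Bm -- before loading C and factorizing, which avoids storing and
+// reloading the intermediate 4x4 result between the syrk update and the Cholesky step.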
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
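+// Note: the dtrsm "nt_rl_inv" kernels compute the 4x4 update C - A*B^T and then solve
+// against a lower triangular E applied from the right ("rl") in transposed form ("nt");
+// "inv" indicates the diagonal is applied through the precomputed reciprocals in
+// inv_diag_E (multiplication instead of division), while the "_one" variants below
+// assume a unit diagonal and take no inv_diag_E argument.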
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
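+// Note: the nn-variant dtrsm kernels below load C through inner_scale_11 directly,
+// whereas the nt variants above go through inner_blend_scale_11; presumably the nt
+// inner product loop leaves the accumulators in a permuted order that the blend step
+// undoes, while the nn loop accumulates columns in their natural order.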
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+ movq ARG9, %r12 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
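+// Note: the dgetrf kernels subtract A*B from C and then LU-factorize the 4x4 block
+// (inner_edge_dgetrf_4x4_lib4), apparently without any row pivoting at this level;
+// the name inv_diag_D suggests the reciprocals of the diagonal of U are stored there.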
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
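+// (presumably the LAPACK-style larfb operation: applies the block reflector defined by the 4
+// Householder vectors in pV and the 4x4 triangular factor pT to the 4-row block pD from the
+// right; the code below handles the unit-triangular 4x4 head of pV explicitly and reuses the
+// generic nt-gemm and gebp inner kernels for the remaining kmax-4 columns)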
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_4_lib4
+ .type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_4_lib4
+ .def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm0
+ //
+ vmovapd 32(%r11), %ymm12
+ vaddpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ //
+ vmovapd 64(%r11), %ymm12
+ vaddpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ //
+ vmovapd 96(%r11), %ymm12
+ vaddpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm12
+ vmovapd %ymm12, 0(%r12)
+ //
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm1, %ymm12
+ vmovapd %ymm12, 32(%r12)
+ //
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm2, %ymm12
+ vmovapd %ymm12, 64(%r12)
+ //
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm3, %ymm12
+ vmovapd %ymm12, 96(%r12)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
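+// note: the double-precision constants below are emitted as pairs of .long words, low 32
+// bits first and then the high 32 bits of the IEEE-754 pattern; e.g. 0.5 = 0x3FE0000000000000
+// is written as ".long 0" followed by ".long 1071644672" (0x3FE00000), and -1.0 =
+// 0xBFF0000000000000 as ".long 0" followed by ".long -1074790400" (0xBFF00000 as a signed
+// 32-bit value); the integer patterns in LC00/LC01 are written with .quad directly.
+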
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_8x4_lib4.S b/kernel/avx/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..e9f1f34
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,13154 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
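+
+// note on the two ABIs: on System V (Linux/Mac) the first six integer/pointer arguments
+// arrive in rdi, rsi, rdx, rcx, r8, r9 and only rbx, rbp, r12-r15 need to be preserved.
+// On Windows x64 the first four arguments arrive in rcx, rdx, r8, r9, the caller reserves
+// a 32-byte shadow space (hence ARG5 sits at STACKSIZE + 8 for the return address + 32 for
+// the shadow space = STACKSIZE+40 from the adjusted rsp), and rdi, rsi and xmm6-xmm15 are
+// callee-saved as well, which is why the Windows prologue also spills those registers.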
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
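+// computation sketch: accumulates D(0:7,0:3) += A * B^T, where A is an 8-row panel stored
+// as two 4-row halves (A0 at r11, A1 at r11 + 4*sda*sizeof(double)) and B is a 4-row panel,
+// both with k packed columns; in scalar terms, for each column l:
+//     for(i=0; i<4; i++) for(j=0; j<4; j++) { D0[i][j] += A0[i+4*l]*B[j+4*l];
+//                                             D1[i][j] += A1[i+4*l]*B[j+4*l]; }
+// instead of broadcasting single B entries, whole B columns are rotated with
+// vshufpd/vperm2f128, so each accumulator holds the permuted mix of entries listed above
+// and is put back into plain column order by the blend/scale inner routines.
+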
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+// movq %r11, %r15 // A1 <- A0
+// addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r11
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+// addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+// cmpl $3, %r10d
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// addq $32, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ cmpl $4, %r10d
+
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// cmpl $3, %r10d
+
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
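+// computation sketch: same 8x4 accumulation as the nt kernel above, but with B traversed
+// in its non-transposed packed layout: single B entries are vbroadcastsd'ed and multiplied
+// against whole A columns, so the accumulators already hold plain result columns
+// ([d00 d10 d20 d30], ...) and no blend step is needed; B advances by one full panel
+// (r14 = 4*sdb*sizeof(double)) every 4 columns and by 8 bytes per leftover column.
+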
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
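+//
+// rough C-model sketch (illustrative comment only, assuming the lib4
+// panel-major layout with bs=4 and sdb the panel stride of B):
+//     for(kk=0; kk<k; kk++)
+//         for(jj=0; jj<8; jj++)
+//             for(ii=0; ii<4; ii++)
+//                 d[ii][jj] += A[ii+kk*4] * B[(kk/4)*4*sdb+(kk%4)+jj*4];
+// column jj of the 4x8 accumulator is kept in register ymm0..ymm7 (ymm<jj>).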
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+	// unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
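+// rough C-model sketch (illustrative comment only): the 8x4 block of A is
+// held in registers, one column per register (rows 0-3 in ymm0..ymm3, rows
+// 4-7 in ymm4..ymm7), and the routine performs the rank-4 update
+//     for(jj=0; jj<k; jj++)
+//         for(ii=0; ii<8; ii++)
+//             for(ll=0; ll<4; ll++)
+//                 C[ii][jj] += A[ii][ll] * B[ll][jj];
+// on the panel-major C (rows 0-3 at C, rows 4-7 at C + 32*sdc bytes).
+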
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
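+//
+// rough sketch (illustrative comment only): when B does not start at a panel
+// boundary (offB>0), this edge peels kend = min(k, 4-offB) iterations,
+//     for(kk=0; kk<kend; kk++)
+//         d[0:8][0:4] += A[0:8][kk] * B[kk][0:4];
+// so that the remaining iterations see B aligned to a panel boundary.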
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+	addq	$32, %r11 // A0+1*bs*sizeof(double)
+	addq	$8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
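+//
+// rough sketch (illustrative comment only): analogous to the 8x4 edge above,
+// it peels kend = min(k, 4-offB) iterations,
+//     for(kk=0; kk<kend; kk++)
+//         d[0:4][0:8] += A[0:4][kk] * B[kk][0:8];
+// leaving B panel-aligned for the main kernel.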
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
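+//
+// rough sketch (illustrative comment only): first 4 iterations of
+// D += A * B^T with B upper triangular, reading only the nonzero triangle:
+//     for(kk=0; kk<4; kk++)
+//         for(jj=0; jj<=kk; jj++)
+//             d[0:8][jj] += A[0:8][kk] * B[jj][kk];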
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 96(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
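+//
+// rough sketch (illustrative comment only): same triangular edge as the
+// routine above, but every step is guarded by a test on k, so it also
+// handles k < 4:
+//     for(kk=0; kk<min(k,4); kk++)
+//         for(jj=0; jj<=kk; jj++)
+//             d[0:8][jj] += A[0:8][kk] * B[jj][kk];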
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
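+//
+// rough sketch (illustrative comment only): triangular edge of D += A * B
+// with B lower triangular; for each processed column kk of A it adds
+//     for(jj=0; jj<=kk; jj++)
+//         d[0:8][jj] += A[0:8][kk] * B[kk][jj];
+// the offB>0 branches start offB rows into the current panel of B and wrap
+// to the next panel once the panel boundary is crossed.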
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
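+//
+// rough sketch (illustrative comment only): generic-size variant of the
+// lower-triangular edge above; it applies the same update
+//     d[0:8][jj] += A[0:8][kk] * B[kk][jj],  jj <= kk
+// but re-checks the remaining k before every step, so any k >= 0 is safe.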
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
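+//
+// note (illustrative comment only): the nt main loops keep the accumulator
+// lanes rotated, as listed in the input layout above; two rounds of vblendpd
+// (first with masks 0xa/0x5 on the odd/even lanes, then with masks 0xc/0x3
+// on the 128-bit halves) restore the plain column layout listed as output.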
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
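+//
+// rough sketch (illustrative comment only): alpha==1.0, beta==1.0 case,
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<8; ii++)
+//             d[ii][jj] += c[ii][jj];
+// rows 0-3 of C are read at C, rows 4-7 at C + 4*sdc*sizeof(double).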
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
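+//
+// rough sketch (illustrative comment only): beta==0.0 case, C is not read:
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<8; ii++)
+//             d[ii][jj] = alpha * d[ii][jj];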
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
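+
+// Reference sketch (illustrative only, made-up names, same acc[j][i] convention as
+// the sketch after inner_scale_11_8x4_lib4): the generic alpha/beta scaling above,
+// including the beta==0.0 short-circuit taken by the 'je 0f' branch.
+//
+//	void ref_scale_ab_8x4(double alpha, double beta, double acc[4][8],
+//			const double *C, int sdc)
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int i=0; i<8; i++)
+//				acc[j][i] *= alpha;
+//		if(beta==0.0)
+//			return; // C is not read at all in this case
+//		for(int j=0; j<4; j++)
+//			for(int i=0; i<8; i++)
+//				acc[j][i] += beta * C[(i/4)*4*sdc + j*4 + i%4];
+//		}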
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta (generalized variant with C row offset)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
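+
+// Reference sketch (illustrative only, made-up names): the generalized variant
+// above additionally takes a row offset (0..3) telling at which row inside its
+// first 4-row panel the 8x4 block of C starts, so logical row r of the block
+// lives in panel r/4, lane r%4. The blend/vperm2f128/vshufpd sequences implement
+// this realignment in registers; the edge masking of partial blocks is left to
+// the store routine and ignored here.
+//
+//	void ref_scale_ab_8x4_gen(double alpha, double beta, int offset,
+//			const double *C, int sdc, double acc[4][8])
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int i=0; i<8; i++)
+//				{
+//				acc[j][i] *= alpha;
+//				if(beta!=0.0)
+//					{
+//					int r = offset + i; // 0 <= offset <= 3
+//					acc[j][i] += beta * C[(r/4)*4*sdc + j*4 + r%4];
+//					}
+//				}
+//		}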
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
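+
+// The accumulation loop of the 8x4 kernels leaves the product in the lane-rotated
+// register layout documented above; the two rounds of vblendpd re-interleave those
+// vectors into plain columns before the same alpha/beta scaling as in
+// inner_scale_ab_8x4_lib4. As a reminder of the instruction semantics used here
+// (illustrative C model, made-up name): in AT&T syntax "vblendpd $imm, srcB, srcA, dst"
+// picks, for every 64-bit lane i, srcB[i] if bit i of imm is set and srcA[i] otherwise.
+//
+//	static void blendpd4(double dst[4], const double srcA[4],
+//			const double srcB[4], int imm)
+//		{
+//		for(int i=0; i<4; i++)
+//			dst[i] = (imm>>i & 1) ? srcB[i] : srcA[i];
+//		}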
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4-ymm7 <- remaining accumulation registers of the 4x8 result (scaled and updated like ymm0-ymm3)
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm0, %ymm1, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm2, %ymm3, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm4, %ymm5, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm6, %ymm7, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
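+
+// Reference sketch (illustrative only, made-up names): the vunpcklpd/vunpckhpd and
+// vperm2f128 pairs above perform an in-register transpose of the 8x4 accumulator;
+// ignoring the exact lane shuffles, the net effect of this routine is the
+// transpose-and-scale below, where prod[c][r] denotes column c, row r of the 8x4
+// product held in ymm0..ymm7 and C holds the 4x8 block as 8 columns of 4
+// contiguous doubles (offsets 0, 32, ..., 224 above). The beta==0.0 case skips
+// the C term exactly as the 'je 0f' branch does.
+//
+//	void ref_tran_scale_ab_4x8(double alpha, double beta, const double prod[4][8],
+//			const double *C, double res[8][4])
+//		{
+//		for(int j=0; j<8; j++)
+//			for(int i=0; i<4; i++)
+//				res[j][i] = alpha*prod[i][j] + beta*C[j*4+i];
+//		}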
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta (generalized variant with C row offset)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ // alg==1
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_lib4, @function
+inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_lib4, .-inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
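+
+// Reference sketch (illustrative only, made-up names, sqrt from <math.h>): the edge
+// step above is a right-looking Cholesky factorization of the top 4x4 block with
+// the sub-diagonal 4x4 block dragged along. D[j][i] is column j, row i of the 8x4
+// block (rows 0..3 in ymm0..3, rows 4..7 in ymm4..7); inv_diag_E receives the
+// reciprocal square roots of the pivots, and a non-positive pivot zeroes its
+// column, which is what the jbe/vxorpd paths implement.
+//
+//	void ref_potrf_8x4_edge(double D[4][8], double *inv_diag_E)
+//		{
+//		for(int k=0; k<4; k++)
+//			{
+//			double d = D[k][k]; // pivot
+//			double inv = d>0.0 ? 1.0/sqrt(d) : 0.0;
+//			inv_diag_E[k] = inv;
+//			for(int i=0; i<8; i++)
+//				D[k][i] *= inv; // column k of the factor, all 8 rows
+//			for(int j=k+1; j<4; j++)
+//				for(int i=0; i<8; i++)
+//					D[j][i] -= D[k][j]*D[k][i]; // trailing update, L(j,k) = D[k][j]
+//			}
+//		}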
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs (variable size)
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
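+
+// Reference sketch (illustrative only, made-up names): forward substitution over
+// the 4 columns of the 8x4 right-hand side, solving X * E^T = B in place for a
+// lower-triangular 4x4 E stored panel-major (column j at E + 4*j, element k at
+// byte offset (j*4+k)*8, matching the broadcasts above) with the reciprocal
+// diagonal precomputed in inv_diag_E. X[j][i] is column j, row i of the block.
+//
+//	void ref_trsm_rlt_inv_8x4(double X[4][8], const double *E,
+//			const double *inv_diag_E)
+//		{
+//		for(int j=0; j<4; j++)
+//			{
+//			for(int i=0; i<8; i++)
+//				X[j][i] *= inv_diag_E[j]; // divide by E(j,j)
+//			for(int k=j+1; k<4; k++)
+//				{
+//				double e_kj = E[j*4+k]; // E(k,j), k>j, strictly lower part
+//				for(int i=0; i<8; i++)
+//					X[k][i] -= e_kj * X[j][i];
+//				}
+//			}
+//		}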
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization (variable size)
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
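+
+// Reference sketch (illustrative only, made-up names): same forward substitution as
+// in the sketch after inner_edge_dtrsm_rlt_inv_8x4_lib4, but with a unit diagonal,
+// so no scaling by a reciprocal diagonal is needed.
+//
+//	void ref_trsm_rlt_one_8x4(double X[4][8], const double *E)
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int k=j+1; k<4; k++)
+//				{
+//				double e_kj = E[j*4+k]; // E(k,j), k>j; E(j,j) assumed 1.0
+//				for(int i=0; i<8; i++)
+//					X[k][i] -= e_kj * X[j][i];
+//				}
+//		}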
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
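+
+// Reference sketch (illustrative only, made-up names): with E upper triangular and
+// transposed, the columns are solved backwards, from 3 down to 0; the broadcasts
+// above read the strictly upper entries E(k,j), k<j, at byte offset (j*4+k)*8.
+//
+//	void ref_trsm_rut_inv_8x4(double X[4][8], const double *E,
+//			const double *inv_diag_E)
+//		{
+//		for(int j=3; j>=0; j--)
+//			{
+//			for(int i=0; i<8; i++)
+//				X[j][i] *= inv_diag_E[j]; // divide by E(j,j)
+//			for(int k=0; k<j; k++)
+//				{
+//				double e_kj = E[j*4+k]; // E(k,j), k<j
+//				for(int i=0; i<8; i++)
+//					X[k][i] -= e_kj * X[j][i];
+//				}
+//			}
+//		}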
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
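+	// forward substitution over the columns of D: for column j, subtract the contributions
+	// of the already solved columns i<j (E element at offset 32*j+8*i in r10), then scale
+	// by inv_diag_E[j] (offset 8*j in r11)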
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
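+	// unit-lower forward substitution, split in two steps: first the top-left 4x4 block of E
+	// (columns 0..3) updates both row blocks (ymm0..3 and ymm4..7); the part of each E column
+	// below the diagonal is isolated with vblendpd, and the element of each D column in the
+	// current row is broadcast with vperm2f128/vpermilpd before the subtraction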
+ // solve top-left
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ addq $128, %r10
+
+
+	// solve bottom-right: E now points to columns 4..7, so only the second row block (ymm4..ymm7) is updated
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10, %r11, 1), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10, %r11, 1), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10, %r11, 1), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+
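+	// backward substitution with the upper 8x8 block of E: first the bottom-right 4x4 block
+	// (acting on ymm4..7, with the top-right columns at 128..224(%r10) propagating into ymm0..3),
+	// then the top-left 4x4 block acting on ymm0..3; each step scales one row by its inverse
+	// diagonal, blends it back into place, and subtracts its multiples from the rows above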
+ // bottom-right
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the 128-bit VEX load below already zeroes the upper half of ymm13)
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+
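+	// same backward substitution as the fixed-size variant, except the steps for rows 7, 6
+	// and 5 are skipped when km (r13d) is not larger than 7, 6 and 5 respectively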
+ // bottom-right
+
+ cmpl $7, %r13d
+ jle 0f
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $6, %r13d
+ jle 1f
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the 128-bit VEX load below already zeroes the upper half of ymm13)
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $5, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+
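+	// unblocked factorization of the 8x4 block, one column at a time: for column j, subtract
+	// the contributions of the previous columns, compute 1.0/diagonal with vdivsd, store it to
+	// inv_diag_E (r10), scale the sub-diagonal part, and use vblendpd to keep the already
+	// final U entries of the column unchanged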
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+// vpermpd $0x00, %ymm1, %ymm13
+ vmovddup %xmm1, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+// vpermpd $0x00, %ymm2, %ymm13
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm2, %ymm13
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm2, %ymm13
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+// vpermpd $0x00, %ymm3, %ymm13
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm3, %ymm13
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm3, %ymm13
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xff, %ymm3, %ymm13
+// vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+
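+	// store the 8x4 result as two 4x4 panels: ymm0..3 go to the panel at D,
+	// ymm4..7 to the panel at D + 4*sdd*sizeof(double)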
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+
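+	// the 4x8 result lives in a single 4-row panel: the eight columns are stored
+	// contiguously at D, D+32, ..., D+224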
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+
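+	// build the row mask for the second panel: km (r12d) is broadcast as a double and compared
+	// by subtraction against the row-index constants in .LC03, so vmaskmovpd only writes
+	// rows 4..km-1; kn (r13d) then selects how many columns are stored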
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+
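+	// row mask from km (r11d) against the row-index constants in .LC02 masks the rows of the
+	// single 4-row panel; columns 0..4 are always stored, columns 5..7 only when kn (r12d)
+	// is at least 6, 7 and 8 respectively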
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+
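+	// lower-triangular store: for columns 1..3 of the first panel the entries above the
+	// diagonal are taken back from memory with vblendpd, so only the lower triangle is
+	// overwritten; the second panel (rows 4..7) is stored in full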
+	vmovapd	%ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+
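+	// generalized store: row masks are built from m0 (r13d) and m1 (r14d) against the index
+	// constants .LC02/.LC03, the columns are shifted left by n0 (r15d) and the count of stored
+	// columns is min(n1,4)-n0; a non-zero offset (r10d) selects one of three paths that rotate
+	// columns and masks across the panel boundary with vperm2f128/vshufpd and write to up to
+	// three panels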
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+
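+	// same generalized store restricted to the lower triangle: in addition to the row masks,
+	// the lanes above the diagonal are disabled column by column by blending the positive
+	// constants from .LC04 into the store mask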
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm13
+#endif
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x4, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x2, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
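+// Hypothetical usage sketch for the kernel above; the caller-side variable
+// names, sizes and the lib4 layout description are assumptions, not taken
+// from this file.
+//
+//	// A: 8 x k in panel-major (lib4) form, two 4-row panels, panel stride sda
+//	// B: 4 x k, a single 4-row panel
+//	// C, D: 8 x 4 blocks with panel strides sdc, sdd
+//	double alpha = 1.0, beta = 1.0;
+//	kernel_dgemm_nt_8x4_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);
+//	// D[0:8,0:4] = alpha * A * B^T + beta * C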
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
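+// Illustrative note for the 'nn' kernel above (inferred, not stated in this
+// file): offsetB gives the position of B's first used entry inside its
+// 4-wide panel, and the 'sall $5' shifts turn a panel stride into a byte
+// stride, i.e. in C terms:
+//
+//	// bytes between consecutive panels of a lib4 matrix
+//	size_t panel_bytes = 4 * (size_t)sda * sizeof(double);  // == sda << 5
+//
+// The inner_edge_* call handles the unaligned leading part of B so that the
+// main nn loop then runs on panel-aligned data.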
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+	movq	ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
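+// Hypothetical call sketch for the factorization kernel above; the claim
+// about inv_diag_D holding reciprocals is an assumption based on its name
+// and on the '_inv' trsm kernels below.
+//
+//	// one blocked Cholesky step on an 8x4 panel:
+//	//   C[0:8,0:4] -= A[0:8,0:k] * B[0:4,0:k]^T   (dgemm sub, nt)
+//	//   factorize the updated 8x4 block in place   (dpotrf edge)
+//	//   D gets the lower-triangular result, inv_diag_D[j] ~ 1.0 / D[j][j]
+//	kernel_dpotrf_nt_l_8x4_lib4(k, A, sda, B, C, sdc, D, sdd, inv_diag_D);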
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
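+// Illustrative note for the '_inv' trsm kernel above (an assumption based on
+// the argument name inv_diag_E): passing the reciprocals of E's diagonal lets
+// the solve replace divisions by multiplications, roughly:
+//
+//	//   D = (C - A * B^T) * E^{-T},  E lower triangular,
+//	//   each column j scaled by inv_diag_E[j] instead of divided by E[j][j]
+//	kernel_dtrsm_nt_rl_inv_8x4_lib4(k, A, sda, B, C, sdc, D, sdd, E, inv_diag_E);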
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+	movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ // epilogue
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_8_lib4
+ .type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_8_lib4
+ .def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm0
+ vaddpd %ymm14, %ymm4, %ymm4
+ //
+ vmovapd 32(%r11), %ymm12
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm1, %ymm1
+ vaddpd %ymm14, %ymm5, %ymm5
+ vbroadcastsd 32(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ //
+ vmovapd 64(%r11), %ymm12
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm2, %ymm2
+ vaddpd %ymm14, %ymm6, %ymm6
+ vbroadcastsd 64(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 72(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ //
+ vmovapd 96(%r11), %ymm12
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm3, %ymm3
+ vaddpd %ymm14, %ymm7, %ymm7
+ vbroadcastsd 96(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 104(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 112(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_diag_lib4.c b/kernel/avx/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..d64f977
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,866 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
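+
+// Example usage (a minimal illustrative sketch, not a library routine),
+// assuming the lib4 panel-major layout: element (i,j) of an m x 4 block is
+// stored at offset (i/4)*4*sda + i%4 + 4*j, with sda the panel leading
+// dimension. This computes D = alpha * A * diag(B), B holding the 4
+// diagonal entries.
+static void example_dgemm_diag_right_4_a0(int m, double alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+	// A and D are m x 4 panel-major blocks
+	kernel_dgemm_diag_right_4_a0_lib4(m, &alpha, A, sda, B, D, sdd);
+	}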
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
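+
+// The cleanup branch above builds its store mask arithmetically: subtracting
+// the number of remaining rows m_f from {0.5, 1.5, 2.5, 3.5} leaves the sign
+// bit set exactly in the lanes that still have to be written, which is what
+// _mm256_maskstore_pd keys on. A minimal stand-alone sketch of the same
+// idiom (illustrative helper, not a library routine):
+static void example_store_first_n(double *dst, __m256d v, int n) // 0 < n <= 4
+	{
+	const double idx[] = {0.5, 1.5, 2.5, 3.5};
+	double n_f = n;
+	__m256i mask = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( idx ), _mm256_broadcast_sd( &n_f ) ) );
+	_mm256_maskstore_pd( dst, mask, v ); // stores lanes 0 .. n-1 only
+	}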
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix, beta=0.0 case
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ sign,
+ a_00,
+ b_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
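+
+// A minimal usage sketch (illustrative helper, not a library routine): with
+// dA pointing at the 4 diagonal entries and B, C, D pointing at 4 x n blocks
+// inside a single lib4 panel (column j at offset 4*j), this computes
+// D = alpha * diag(dA) * B + beta * C.
+static void example_dgemm_diag_left_4(int n, double alpha, double *dA, double *B, double beta, double *C, double *D)
+	{
+	kernel_dgemm_diag_left_4_lib4(n, &alpha, dA, B, &beta, C, D);
+	}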
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256i
+ mask;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ mask = _mm256_set_epi64x( 1, -1, -1, -1 );
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+ _mm256_maskstore_pd( &D[4], mask, d_01 );
+ _mm256_maskstore_pd( &D[8], mask, d_02 );
+ _mm256_maskstore_pd( &D[12], mask, d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m128d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm_loaddup_pd( alpha );
+ beta0 = _mm_loaddup_pd( beta );
+
+ a_00 = _mm_load_pd( &A[0] );
+ a_00 = _mm_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[4] );
+ d_01 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[8] );
+ d_02 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[12] );
+ d_03 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+ c_00 = _mm_load_pd( &C[4] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_01 = _mm_add_pd( c_00, d_01 );
+ c_00 = _mm_load_pd( &C[8] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_02 = _mm_add_pd( c_00, d_02 );
+ c_00 = _mm_load_pd( &C[12] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_03 = _mm_add_pd( c_00, d_03 );
+
+ _mm_store_pd( &D[0], d_00 );
+ _mm_store_pd( &D[4], d_01 );
+ _mm_store_pd( &D[8], d_02 );
+ _mm_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+
+ _mm_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = A[0] * alpha0;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
diff --git a/kernel/avx/kernel_dgemv_12_lib4.S b/kernel/avx/kernel_dgemv_12_lib4.S
new file mode 100644
index 0000000..c51ad9a
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_12_lib4.S
@@ -0,0 +1,1322 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
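+// note: the Windows variant below uses a larger stack frame because the Win64 calling
+// convention also treats rdi, rsi and xmm6-xmm15 as callee-saved, so they are spilled in the
+// prologue and restored in the epilogue in addition to the registers saved above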
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_12_lib4, @function
+inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r14 // A1 <- A0
+ addq %r12, %r14 // A1 <- A0 + 4*sda*sizeof(double)
+ movq %r14, %r15 // A2 <- A1
+ addq %r12, %r15 // A2 <- A1 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 32(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 96(%r14), %ymm8
+ addq $128, %r14 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A2+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $32, %r11
+ addq $32, %r14
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_12_lib4, .-inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_12_lib4, @function
+inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 256(%r11) // software prefetch
+ prefetcht0 320(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ prefetcht0 256(%r14) // software prefetch
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ prefetcht0 320(%r14) // software prefetch
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
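+	// build a lane mask from the remaining count k in r10d: LC02 holds 0.5+lane index, so
+	// (0.5+ii) - k is negative exactly for ii < k and the vmaskmovpd below loads only the
+	// valid elements of x (masked-out lanes read as zero and add nothing to the products)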
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_12_lib4, .-inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_12_lib4, @function
+inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_12_lib4, .-inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_12_lib4, @function
+inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
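+	// each accumulator holds four partial sums of one output element: vhaddpd adds adjacent
+	// pairs within each 128-bit lane, the vperm2f128 pairs gather the matching low/high lane
+	// halves, and the final vaddpd leaves the 12 reduced dot products in ymm0, ymm1 and ymm2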
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_12_lib4, .-inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_12_lib4, @function
+inner_blender_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_12_lib4; .scl 2; .type 32; .endef
+inner_blender_n_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_12_lib4, .-inner_blender_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_12_lib4, @function
+inner_blender_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_12_lib4; .scl 2; .type 32; .endef
+inner_blender_t_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_12_lib4, .-inner_blender_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12_lib4, @function
+inner_store_12_lib4:
+#elif defined(OS_MAC)
+_inner_store_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12_lib4; .scl 2; .type 32; .endef
+inner_store_12_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+ vmovupd %ymm2, 64(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12_lib4, .-inner_store_12_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
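+//
+// usage sketch (an illustration, not from the original sources): A points to a 12 x k block
+// stored in 4-row panels with consecutive panels 4*sda doubles apart, and the kernel computes
+// z[0:12] = alpha * A * x + beta * y, e.g. called from C as
+//
+//	double alpha = 1.0, beta = 0.0;
+//	kernel_dgemv_n_12_lib4(k, &alpha, A, sda, x, &beta, y, z);
+//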
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_12_lib4
+ .type kernel_dgemv_n_12_lib4, @function
+kernel_dgemv_n_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_12_lib4
+_kernel_dgemv_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_12_lib4
+ .def kernel_dgemv_n_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_12_lib4, .-kernel_dgemv_n_12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_12_lib4
+ .type kernel_dgemv_t_12_lib4, @function
+kernel_dgemv_t_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_12_lib4
+_kernel_dgemv_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_12_lib4
+ .def kernel_dgemv_t_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_12_lib4, .-kernel_dgemv_t_12_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
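+// note: each pair of .long values below encodes one double constant in little-endian
+// IEEE-754 order (low 32 bits first), e.g. LC02 holds { 3.5 2.5 1.5 0.5 }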
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_4_lib4.S b/kernel/avx/kernel_dgemv_4_lib4.S
new file mode 100644
index 0000000..656e220
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_4_lib4.S
@@ -0,0 +1,4503 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_4_lib4, @function
+inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $8, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_4_lib4, .-inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_4_lib4, @function
+inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmaskmovpd 0(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmaskmovpd 32(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmaskmovpd 64(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmaskmovpd 96(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_4_lib4, .-inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_4_lib4, @function
+inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+// vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ sall $3, %r10d // *sizeof(double)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_4_lib4, .-inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemv_add_t_4_lib4, @function
+inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemv_add_t_4_lib4:
+#endif
+#endif
+
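+	// offA is the row offset of the first valid element inside the 4-row panel: A and x are
+	// rewound by offA elements and a two-sided lane mask keeps offA <= ii < offA+kmax, so only
+	// the in-range elements of this leading panel contribute to the accumulators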
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $3, %r15d // offA*sizeof(double)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2sd %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2sd %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm13, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+ vandpd %ymm15, %ymm14, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $4, %r10d // kmax - (4-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemv_add_t_4_lib4, .-inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
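+	// 4x4 diagonal block of the symmetric matrix: only the lower triangle is referenced; for
+	// column jj the vblendpd masks zero rows < jj in the transposed accumulation and rows <= jj
+	// in the normal update, so every diagonal entry is counted exactly once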
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovupd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_gen_lib4, @function
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2sd %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $3, %rax // *sizeof(double)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_gen_lib4, .-inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_lib4, @function
+inner_blend_n_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_lib4; .scl 2; .type 32; .endef
+inner_blend_n_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_lib4, .-inner_blend_n_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_lib4, @function
+inner_blend_t_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_lib4; .scl 2; .type 32; .endef
+inner_blend_t_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_lib4, .-inner_blend_t_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
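+// a minimal C reference sketch of what this routine computes (illustrative
+// only, not the implementation; acc_a..acc_d stand for the four partial
+// accumulators held in ymm0..ymm3, the result is left in ymm0 and stored
+// later by a separate inner routine):
+//
+//   for(int i=0; i<4; i++)
+//       acc[i] = alpha[0]*(acc_a[i]+acc_b[i]+acc_c[i]+acc_d[i]) + beta[0]*y[i];
+//
+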
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_4_lib4, @function
+inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_4_lib4, .-inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_4_lib4, @function
+inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_4_lib4, .-inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
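+// a minimal C reference sketch of what this routine computes (illustrative
+// only; acc[j][0..3] stand for the four lanes of ymm<j>, i.e. the partial
+// sums of output element j; the vhaddpd/vperm2f128 sequence below performs
+// the four horizontal reductions in parallel and leaves [z0 z1 z2 z3] in ymm0):
+//
+//   for(int j=0; j<4; j++)
+//       z[j] = alpha[0]*(acc[j][0]+acc[j][1]+acc[j][2]+acc[j][3]) + beta[0]*y[j];
+//
+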
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib4, @function
+inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib4, .-inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib4, @function
+inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib4, .-inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
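+// a minimal C reference sketch of the forward substitution performed below
+// (illustrative only; z[0..3] is the right-hand side held in ymm0 on entry
+// and the solution on exit; E is the 4x4 lower-triangular block stored
+// column-major in the panel, element (i,j) at E[i+4*j]; the diagonal is
+// applied through its precomputed inverse):
+//
+//   for(int j=0; j<4; j++) {
+//       z[j] *= inv_diag_E[j];
+//       for(int i=j+1; i<4; i++)
+//           z[i] -= E[i+4*j] * z[j];
+//   }
+//
+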
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_lib4, @function
+inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_lib4, .-inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS, variable size version
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_vs_lib4, @function
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ // return
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_vs_lib4, .-inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
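+// a minimal C reference sketch of the backward (transposed) substitution
+// performed below (illustrative only; z[0..3] is held in ymm0; E is the 4x4
+// lower-triangular block stored column-major in the panel, so element (i,j)
+// of E^T is read from E[j+4*i]; the diagonal is applied through its inverse):
+//
+//   for(int j=3; j>=0; j--) {
+//       z[j] *= inv_diag_E[j];
+//       for(int i=0; i<j; i++)
+//           z[i] -= E[j+4*i] * z[j];
+//   }
+//
+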
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_4_lib4, @function
+inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_4_lib4, .-inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_3_lib4, @function
+inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_3_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0x8, %ymm12, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+0:
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_3_lib4, .-inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_2_lib4, @function
+inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_2_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ cmpl $3, %r12d
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+// vmovsd 88(%r10), %xmm11
+// vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+// vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vblendpd $0xc, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xc, %ymm12, %ymm0, %ymm0
+
+ je 0f
+ jl 1f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+// vbroadcastsd 16(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_2_lib4, .-inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_1_lib4, @function
+inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_1_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xe, %ymm12, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ je 0f
+
+ cmpl $2, %r12d
+ je 1f
+ jl 2f
+
+ vmovsd 24(%r10), %xmm10
+ vblendpd $0xe, %ymm14, %ymm10, %ymm10
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+ vmovsd 16(%r10), %xmm9
+ vblendpd $0xe, %ymm14, %ymm9, %ymm9
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+2:
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_1_lib4, .-inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- x+4*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
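+// a minimal C reference sketch of this edge routine (illustrative only;
+// it handles the 4x4 upper-triangular corner of z = U*x, column j of the
+// panel contributing only to rows 0..j; A is stored column-major in the
+// panel, element (i,j) at A[i+4*j]; acc[i] stands for output element i,
+// which the code spreads over the four accumulators):
+//
+//   for(int j=0; j<4; j++)
+//       for(int i=0; i<=j; i++)
+//           acc[i] += A[i+4*j] * x[j];
+//   A += 4*4; x += 4; k -= 4;
+//
+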
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_4_lib4, @function
+inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_4_lib4, .-inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// kernel for dtrmv upper-transposed: accumulate A^T*x over the k x 4 panel
+// (the upper-triangular corner is handled in the clean-up)
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dtrmv_ut_4_lib4, @function
+inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dtrmv_ut_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jle 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $4, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+// vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+// vmovupd LC02(%rip), %ymm13
+#endif
+// vmovddup %xmm14, %xmm14
+// vinsertf128 $1, %xmm14, %ymm14, %ymm14
+// vsubpd %ymm14, %ymm13, %ymm14
+//
+// vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovupd 0(%r13), %ymm12
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dtrmv_ut_4_lib4, .-inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib4, @function
+inner_store_4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib4; .scl 2; .type 32; .endef
+inner_store_4_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib4, .-inner_store_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
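+// a minimal C reference sketch of this store (illustrative only): only the
+// first km elements of the accumulator are written; the vmaskmovpd mask is
+// built from km and the .LC02 constant defined elsewhere in this file:
+//
+//   for(int i=0; i<km && i<4; i++)
+//       D[i] = acc[i];
+//
+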
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib4, @function
+inner_store_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib4, .-inner_store_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
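+// a minimal C reference sketch of this store (illustrative only): only rows
+// in the half-open range [k0, k1) are written, through a two-sided
+// vmaskmovpd mask built from the .LC02 constant defined elsewhere:
+//
+//   for(int i=k0; i<k1 && i<4; i++)
+//       D[i] = acc[i];
+//
+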
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib4, @function
+inner_store_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r11d, %xmm14, %xmm14
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib4, .-inner_store_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+
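+// a minimal C reference of the operation performed by this kernel
+// (illustrative only, assuming the usual lib4 panel-major layout with bs=4,
+// i.e. element (i,j) of the 4-row panel at A[i+4*j]):
+//
+//   void ref_dgemv_n_4(int k, const double *alpha, const double *A,
+//           const double *x, const double *beta, const double *y, double *z)
+//   {
+//       for(int i=0; i<4; i++) {
+//           double t = 0.0;
+//           for(int j=0; j<k; j++)
+//               t += A[i+4*j] * x[j];
+//           z[i] = alpha[0]*t + beta[0]*y[i];
+//       }
+//   }
+//
+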
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_lib4
+ .type kernel_dgemv_n_4_lib4, @function
+kernel_dgemv_n_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_lib4
+_kernel_dgemv_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_lib4
+ .def kernel_dgemv_n_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_lib4, .-kernel_dgemv_n_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .type kernel_dgemv_n_4_vs_lib4, @function
+kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_vs_lib4
+_kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .def kernel_dgemv_n_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_vs_lib4, .-kernel_dgemv_n_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_n_4_gen_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .type kernel_dgemv_n_4_gen_lib4, @function
+kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_gen_lib4
+_kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .def kernel_dgemv_n_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k0
+ movq ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_gen_lib4, .-kernel_dgemv_n_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
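+// a minimal C reference of the operation performed by this kernel
+// (illustrative only, assuming the usual lib4 panel-major layout: row i,
+// column j of the k x 4 block sits at A[(i/4)*4*sda + i%4 + 4*j]):
+//
+//   void ref_dgemv_t_4(int k, const double *alpha, const double *A, int sda,
+//           const double *x, const double *beta, const double *y, double *z)
+//   {
+//       for(int j=0; j<4; j++) {
+//           double t = 0.0;
+//           for(int i=0; i<k; i++)
+//               t += A[(i/4)*4*sda + i%4 + 4*j] * x[i];
+//           z[j] = alpha[0]*t + beta[0]*y[j];
+//       }
+//   }
+//
+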
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_lib4
+ .type kernel_dgemv_t_4_lib4, @function
+kernel_dgemv_t_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_lib4
+_kernel_dgemv_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_lib4
+ .def kernel_dgemv_t_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_lib4, .-kernel_dgemv_t_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .type kernel_dgemv_t_4_vs_lib4, @function
+kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_vs_lib4
+_kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .def kernel_dgemv_t_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_vs_lib4, .-kernel_dgemv_t_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .type kernel_dgemv_t_4_gen_lib4, @function
+kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_gen_lib4
+_kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .def kernel_dgemv_t_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv edge & kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_gen_lib4, .-kernel_dgemv_t_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+
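+// a minimal C reference of the operation performed by this kernel
+// (illustrative only, same panel-major layout as above; it is assumed that
+// the 4x4 lower-triangular factor follows the k gemv columns, i.e. starts at
+// A+4*k, and that its inverted diagonal is passed in inv_diag_A):
+//
+//   void ref_dtrsv_ln_inv_4(int k, const double *A, const double *inv_diag_A,
+//           const double *x, const double *y, double *z)
+//   {
+//       const double *E = A + 4*k; // diagonal block after the k columns
+//       double w[4];
+//       for(int i=0; i<4; i++) {
+//           w[i] = y[i];
+//           for(int j=0; j<k; j++)
+//               w[i] -= A[i+4*j] * x[j];
+//       }
+//       for(int j=0; j<4; j++) {   // forward substitution (see edge above)
+//           w[j] *= inv_diag_A[j];
+//           for(int i=j+1; i<4; i++)
+//               w[i] -= E[i+4*j] * w[j];
+//       }
+//       for(int i=0; i<4; i++)
+//           z[i] = w[i];
+//   }
+//
+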
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .type kernel_dtrsv_ln_inv_4_lib4, @function
+kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_lib4
+_kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .def kernel_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+4*k*sizeof(double)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+4*k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_lib4, .-kernel_dtrsv_ln_inv_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .type kernel_dtrsv_ln_inv_4_vs_lib4, @function
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_vs_lib4
+_kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .def kernel_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+4*k*sizeof(double)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+4*k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+ // store vs
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_vs_lib4, .-kernel_dtrsv_ln_inv_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .type kernel_dtrsv_lt_inv_4_lib4, @function
+kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_4_lib4
+_kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .def kernel_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_4_lib4, .-kernel_dtrsv_lt_inv_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .type kernel_dtrsv_lt_inv_3_lib4, @function
+kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_3_lib4
+_kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .def kernel_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_3_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_3_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $3, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_3_lib4, .-kernel_dtrsv_lt_inv_3_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .type kernel_dtrsv_lt_inv_2_lib4, @function
+kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_2_lib4
+_kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .def kernel_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_2_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_2_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $2, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_2_lib4, .-kernel_dtrsv_lt_inv_2_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .type kernel_dtrsv_lt_inv_1_lib4, @function
+kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_1_lib4
+_kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .def kernel_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_1_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_1_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $1, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_1_lib4, .-kernel_dtrsv_lt_inv_1_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_4_lib4
+ .type kernel_dtrmv_un_4_lib4, @function
+kernel_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_4_lib4
+_kernel_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_4_lib4
+ .def kernel_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG4, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_4_lib4, .-kernel_dtrmv_un_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_lib4
+ .type kernel_dtrmv_ut_4_lib4, @function
+kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_lib4
+_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_lib4
+ .def kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_lib4, .-kernel_dtrmv_ut_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *y, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .type kernel_dtrmv_ut_4_vs_lib4, @function
+kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_vs_lib4
+_kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .def kernel_dtrmv_ut_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+ movq ARG6, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_vs_lib4, .-kernel_dtrmv_ut_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_lib4
+ .type kernel_dgemv_nt_4_lib4, @function
+kernel_dgemv_nt_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_lib4
+_kernel_dgemv_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_lib4
+ .def kernel_dgemv_nt_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_lib4, .-kernel_dgemv_nt_4_lib4
+#endif
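+
+// nt kernel: one pass over A computes both the A*x_n update of z_n (x_n pre-scaled by
+// alpha_n above) and the A'*x_t accumulation, which is then blended with alpha_t, beta_t
+// and y_t and stored to z_t (sketch, assuming the standard lib4 panel-major layout)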
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .type kernel_dgemv_nt_4_vs_lib4, @function
+kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_vs_lib4
+_kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .def kernel_dgemv_nt_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+0:
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_vs_lib4, .-kernel_dgemv_nt_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+
+
+
+
+// 1      2              3        4          5        6          7          8
+// void kernel_dsymv_l_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .type kernel_dsymv_l_4_gen_lib4, @function
+kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_gen_lib4
+_kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .def kernel_dsymv_l_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_gen_lib4, .-kernel_dsymv_l_4_gen_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_8_lib4.S b/kernel/avx/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..53d371e
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ addq $32, %r11
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
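+	// build a lane mask for the k%4 leftover elements: ymm14 = { 0.5 1.5 2.5 3.5 } - (double)k_left
+	// has the sign bit set exactly in lanes i < k_left, so the vmaskmovpd below loads only the
+	// remaining entries of x and leaves the other lanes at zero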
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ // first 4 columns
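+	// ymm14 is zero; vblendpd with masks 0x1/0x3/0x7 keeps only the first 1/2/3 entries of each
+	// loaded column, i.e. the upper-triangular part of the 4x4 diagonal block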
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+
+ // last 4 columns
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+ // reduction
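+	// each ymm register holds the partial sums of one output element spread over 4 lanes; the
+	// vhaddpd / vperm2f128 / vaddpd cascade transposes and adds them into [z0 z1 z2 z3] and
+	// [z4 z5 z6 z7]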
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_8_lib4
+ .type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_8_lib4
+ .def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
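+
+// kernel_dgemv_n_8_lib4 (sketch of the intended semantics, assuming the lib4 panel layout):
+// z[0:8] = alpha*A[0:8,0:k]*x[0:k] + beta*y[0:8], with the 8 rows of A split across two
+// 4-row panels spaced 4*sda doubles apart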
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_8_lib4
+ .type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_8_lib4
+ .def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_8_lib4
+ .type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_8_lib4
+ .def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
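+	// (each pair of .long values is the little-endian low/high word of an IEEE-754 double,
+	// e.g. 0x00000000/0x3FE00000 = 0.5; .LC02 is the { 0.5 1.5 2.5 3.5 } lane ramp used by
+	// the masked clean-up loads)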
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
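+	// each of the four columns below repeats the same unblocked Householder step: accumulate
+	// the squared norm of the subdiagonal part (beta), form beta = -sign(alpha)*sqrt(alpha^2+beta),
+	// store tau = (beta-alpha)/beta in dD, scale the reflector tail by 1/(alpha-beta), overwrite
+	// the diagonal with beta, then apply I - tau*v*v' to the trailing columns (gemv_t & ger)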
+ // first column
+ beta = 0.0;
+ ii = 1;
+ if(m>1)
+ {
+ tmp = pD[1+ps*0];
+ beta += tmp*tmp;
+ if(m>2)
+ {
+ tmp = pD[2+ps*0];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*0];
+ beta += tmp*tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ ii = 1;
+ if(m>1)
+ {
+ pD[1+ps*0] *= tmp;
+ if(m>2)
+ {
+ pD[2+ps*0] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*0] *= tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*0] *= tmp;
+ pD[1+ii*sdd+ps*0] *= tmp;
+ pD[2+ii*sdd+ps*0] *= tmp;
+ pD[3+ii*sdd+ps*0] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[0+ps*1];
+ w2 = pD[0+ps*2];
+ w3 = pD[0+ps*3];
+ if(m>1)
+ {
+ w1 += pD[1+ps*1] * pD[1+ps*0];
+ w2 += pD[1+ps*2] * pD[1+ps*0];
+ w3 += pD[1+ps*3] * pD[1+ps*0];
+ if(m>2)
+ {
+ w1 += pD[2+ps*1] * pD[2+ps*0];
+ w2 += pD[2+ps*2] * pD[2+ps*0];
+ w3 += pD[2+ps*3] * pD[2+ps*0];
+ if(m>3)
+ {
+ w1 += pD[3+ps*1] * pD[3+ps*0];
+ w2 += pD[3+ps*2] * pD[3+ps*0];
+ w3 += pD[3+ps*3] * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[0+ps*1] += w1;
+ pD[0+ps*2] += w2;
+ pD[0+ps*3] += w3;
+ if(m>1)
+ {
+ pD[1+ps*1] += w1 * pD[1+ps*0];
+ pD[1+ps*2] += w2 * pD[1+ps*0];
+ pD[1+ps*3] += w3 * pD[1+ps*0];
+ if(m>2)
+ {
+ pD[2+ps*1] += w1 * pD[2+ps*0];
+ pD[2+ps*2] += w2 * pD[2+ps*0];
+ pD[2+ps*3] += w3 * pD[2+ps*0];
+ if(m>3)
+ {
+ pD[3+ps*1] += w1 * pD[3+ps*0];
+ pD[3+ps*2] += w2 * pD[3+ps*0];
+ pD[3+ps*3] += w3 * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+ }
+ if(m==1)
+ return;
+ // second column
+ beta = 0.0;
+ if(m>2)
+ {
+ tmp = pD[2+ps*1];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*1];
+ beta += tmp*tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v1
+ pD[1+ps*1] = beta;
+ if(m>2)
+ {
+ pD[2+ps*1] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*1] *= tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] *= tmp;
+ pD[1+ii*sdd+ps*1] *= tmp;
+ pD[2+ii*sdd+ps*1] *= tmp;
+ pD[3+ii*sdd+ps*1] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[1+ps*2];
+ w3 = pD[1+ps*3];
+ if(m>2)
+ {
+ w2 += pD[2+ps*2] * pD[2+ps*1];
+ w3 += pD[2+ps*3] * pD[2+ps*1];
+ if(m>3)
+ {
+ w2 += pD[3+ps*2] * pD[3+ps*1];
+ w3 += pD[3+ps*3] * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[1+ps*2] += w2;
+ pD[1+ps*3] += w3;
+ if(m>2)
+ {
+ pD[2+ps*2] += w2 * pD[2+ps*1];
+ pD[2+ps*3] += w3 * pD[2+ps*1];
+ if(m>3)
+ {
+ pD[3+ps*2] += w2 * pD[3+ps*1];
+ pD[3+ps*3] += w3 * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+ }
+ if(m==2)
+ return;
+ // third column
+ beta = 0.0;
+ if(m>3)
+ {
+ tmp = pD[3+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v2
+ pD[2+ps*2] = beta;
+ if(m>3)
+ {
+ pD[3+ps*2] *= tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] *= tmp;
+ pD[1+ii*sdd+ps*2] *= tmp;
+ pD[2+ii*sdd+ps*2] *= tmp;
+ pD[3+ii*sdd+ps*2] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[2+ps*3];
+ if(m>3)
+ {
+ w3 += pD[3+ps*3] * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ w3 = - dD[2] * w3;
+ pD[2+ps*3] += w3;
+ if(m>3)
+ {
+ pD[3+ps*3] += w3 * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+ }
+ if(m==3)
+ return;
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau3
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v3
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] *= tmp;
+ pD[1+ii*sdd+ps*3] *= tmp;
+ pD[2+ii*sdd+ps*3] *= tmp;
+ pD[3+ii*sdd+ps*3] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+// unblocked algorithm
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp, w0;
+ double *pC00, *pC10, *pC01, *pC11;
+ int offset;
+ double *pD0 = pD-offD;
+ for(ii=0; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ tmp = pC10[1+offset];
+ beta += tmp*tmp;
+ tmp = pC10[2+offset];
+ beta += tmp*tmp;
+ tmp = pC10[3+offset];
+ beta += tmp*tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ pC10[0+offset] *= tmp;
+ pC10[1+offset] *= tmp;
+ pC10[2+offset] *= tmp;
+ pC10[3+offset] *= tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ pC00[0] = beta;
+ }
+ if(ii<n)
+ {
+ pC01 = pC00 + ps;
+ pC11 = pC10 + ps;
+ kmax = jmax;
+ kmax0 = jmax0;
+ jmax = n-ii-1;
+ jj = 0;
+ for( ; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ps*jj] * 1.0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+ w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+ w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ps*jj] += w0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ offset = offset-ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+ pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+ pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
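+// kernel_dlarf_4_lib4 applies the 4 Householder reflectors stored in the first 4
+// columns of pD (unit lower part, taus in dD) to the m x n matrix pC0 from the left:
+// it builds the 4x4 lower triangular factor T and computes C -= V * (T * (V^T * C)),
+// two columns of C per iteration plus a one-column clean-up loop.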
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 2;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
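+	// T(j,j) = tau_j and, for i>j, T(i,j) = -tau_i * sum_{k=j..i-1} (v_i.v_k) * T(k,j),
+	// so that the update below applies all four reflectors at once via I - V*T*V^T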
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[1+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pD[1+ps*0];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] = tmp;
+ tmp = pC[1+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pD[2+ps*0];
+ d1 = pD[2+ps*1];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] = tmp;
+ tmp = pC[2+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] = tmp;
+ if(m>3)
+ {
+ d0 = pD[3+ps*0];
+ d1 = pD[3+ps*1];
+ d2 = pD[3+ps*2];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] = tmp;
+ tmp = pC[3+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ pC[0+ps*1] -= pW[1+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+ps*0];
+ pW[0+ldw*1] = tmp;
+ if(m>2)
+ {
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+ps*1];
+ pW[0+ldw*2] = tmp;
+ if(m>3)
+ {
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+ps*2];
+ pW[0+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ }
+
+ return;
+ }
+
+
+
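+// kernel_dlarf_t_4_lib4 is a variant of kernel_dlarf_4_lib4 that additionally takes
+// pVt, a transposed copy of the reflector block V, and a 4 x n workspace pW0, so that
+// W = V^T*C can be formed with dgemm kernels and the trailing rows of C -= V*W can be
+// updated with dger4_sub kernels.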
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC, *pW;
+ double pT[16];// = {};
+ int ldt = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ __m256d
+ _w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
+ ii = 0;
+#if 1
+ double alpha = 1.0;
+ double beta = 0.0;
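+	// W = V^T*C is delegated to dgemm_nn kernels (pW0 = 1.0*pVt*pC0 + 0.0*pW0),
+	// 12/8/4 columns of C at a time; the #else branch keeps a hand-written AVX version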
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<n-11; ii+=12)
+ {
+ kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+#endif
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ if(ii<n)
+ {
+// kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+ kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+ }
+#else
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ _w0 = _mm256_setzero_pd();
+ _w1 = _mm256_setzero_pd();
+ _w2 = _mm256_setzero_pd();
+ _w3 = _mm256_setzero_pd();
+ for(jj=0; jj<m-3; jj+=4)
+ {
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ _d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ // TODO mask store
+ _mm256_storeu_pd( &pW[0+ps*0], _w0 );
+ _mm256_storeu_pd( &pW[0+ps*1], _w1 );
+ _mm256_storeu_pd( &pW[0+ps*2], _w2 );
+ _mm256_storeu_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ps*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ }
+#endif
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _w1 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _w2 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _w3 = _mm256_mul_pd( _t0, _tp );
+
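+		// note: TARGET_X64_INTEL_GASWELL does not appear to be a defined build target,
+		// so the FMA branch below is normally compiled out and the mul/add fallback in
+		// the #else branch is used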
+#if defined(TARGET_X64_INTEL_GASWELL)
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ _mm256_store_pd( &pW[0+ps*1], _w1 );
+ _mm256_store_pd( &pW[0+ps*2], _w2 );
+ _mm256_store_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ }
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ b1 = pW[3+ps*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ // load
+ c00 = pC[0+jj*sdc+ps*2];
+ c10 = pC[1+jj*sdc+ps*2];
+ c20 = pC[2+jj*sdc+ps*2];
+ c30 = pC[3+jj*sdc+ps*2];
+ c01 = pC[0+jj*sdc+ps*3];
+ c11 = pC[1+jj*sdc+ps*3];
+ c21 = pC[2+jj*sdc+ps*3];
+ c31 = pC[3+jj*sdc+ps*3];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*3];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*3];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*3];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c30 -= b0;
+ b1 = pW[3+ps*3];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*2] = c00;
+ pC[0+jj*sdc+ps*3] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*2] = c10;
+ pC[1+jj*sdc+ps*3] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*2] = c20;
+ pC[2+jj*sdc+ps*3] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*2] = c30;
+ pC[3+jj*sdc+ps*3] = c31;
+ }
+ }
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ }
+
+#if 1
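+	// the trailing rows (jj>=4) receive the rank-4 update C -= V*W through the
+	// dger4_sub kernels, 12/8/4 rows at a time; the #else branch keeps the
+	// intrinsics/scalar fallback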
+ jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+#endif
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+ }
+ if(jj<m)
+ {
+ kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+ }
+#else
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ _c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+ _c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+ _c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+ _c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ // store
+ _mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ // load
+ c00 = pC[ll+jj*sdc+ps*2];
+ c01 = pC[ll+jj*sdc+ps*3];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*3];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*2] = c00;
+ pC[ll+jj*sdc+ps*3] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+#endif
+
+ return;
+ }
+
+
+
+// assume n>=4
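+// kernel_dgelqf_4_lib4 computes the LQ factorization of a 4 x n block: for each of
+// the four rows it generates a Householder reflector acting on the columns to its
+// right (beta on the diagonal, scaled reflector in the rest of the row, tau in dD)
+// and applies it to the rows below.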
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ }
+ return;
+ }
+
+
+
+// unblocked algorithm
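+// kernel_dgelqf_vs_lib4 is the variable-size counterpart on an m x n panel-major
+// submatrix starting at row offset offD: it factors k rows LQ-style, applying each
+// reflector to the rows below as it is generated.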
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp;
+ double w00, w01,
+ w10, w11,
+ w20, w21,
+ w30, w31;
+ __m256d
+ _a0, _b0, _t0, _w0, _w1;
+ double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+ double pT[4];
+ int ldt = 2;
+ double *pD0 = pD-offD;
+ ii = 0;
+#if 1 // rank 2
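+	// main loop: two rows per iteration; the pair of reflectors is applied to the rows
+	// below through a 2x2 triangular factor pT (rank-2 update), with an AVX inner loop
+	// that processes 4 trailing rows at a time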
+ for(; ii<imax-1; ii+=2)
+ {
+ // first row
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ kmax = n-ii;
+ w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ // second row
+ pC11 = pC10+ps*1;
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC11[0+ps*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ps*jj] *= tmp;
+ }
+ // compute T
+ kmax = n-ii;
+ tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+ pT[0+ldt*0] = - dD[ii+0];
+ pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldt*1] = - dD[ii+1];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-2;
+ jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+ pC20 = pC20a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ pC20 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _w1 = _mm256_load_pd( &pC20[0+ps*1] );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
+ _w1 = _mm256_mul_pd( _w1, _b0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC20[0+ps*0], _a0 );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _a0 = _mm256_add_pd( _a0, _w1 );
+ _mm256_store_pd( &pC20[0+ps*1], _a0 );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w1, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC20[0+ps*kk], _a0 );
+ }
+ pC20 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ if(ii<n)
+ {
+ // compute T
+ pT[0+ldt*0] = - dD[ii+0];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ pC10 = pC10a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ pC10 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC10[0+ps*0] );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC10[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC10[0+ps*0], _a0 );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC10[0+ps*kk], _a0 );
+ }
+ pC10 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// assume kmax>=4
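+// kernel_dlarft_4_lib4 builds the 4x4 upper triangular factor T (with -tau on the
+// diagonal) for the four row-stored reflectors of an LQ factorization, from the taus
+// in dD and the dot products v_i.v_j accumulated below; T is consumed by the
+// dlarfb4_r kernels.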
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+ {
+ const int ps = 4;
+ int kk;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ // 0
+ // 1
+ v10 = pD[0+ps*1];
+ // 2
+ v10 += pD[1+ps*2]*pD[0+ps*2];
+ v20 = pD[0+ps*2];
+ v21 = pD[1+ps*2];
+ // 3
+ v10 += pD[1+ps*3]*pD[0+ps*3];
+ v20 += pD[2+ps*3]*pD[0+ps*3];
+ v21 += pD[2+ps*3]*pD[1+ps*3];
+ v30 = pD[0+ps*3];
+ v31 = pD[1+ps*3];
+ v32 = pD[2+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ v10 += pD[1+ps*kk]*pD[0+ps*kk];
+ v20 += pD[2+ps*kk]*pD[0+ps*kk];
+ v30 += pD[3+ps*kk]*pD[0+ps*kk];
+ v21 += pD[2+ps*kk]*pD[1+ps*kk];
+ v31 += pD[3+ps*kk]*pD[1+ps*kk];
+ v32 += pD[3+ps*kk]*pD[2+ps*kk];
+ }
+ pT[0+ps*0] = - dD[0];
+ pT[1+ps*1] = - dD[1];
+ pT[2+ps*2] = - dD[2];
+ pT[3+ps*3] = - dD[3];
+ pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+ pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+ pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+ pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+ pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+ return;
+ }
+
+
+
+// assume n>=4
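+// fused variant: same LQ factorization of the 4 x n block as kernel_dgelqf_4_lib4,
+// but the 4x4 factor T (as in kernel_dlarft_4_lib4) is assembled on the fly, reusing
+// the dot products w0..w3 already needed for the row updates; on the Haswell target
+// this C version is compiled out and that build provides its own version.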
+#if ! defined(TARGET_X64_INTEL_HASWELL)
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ return;
+ }
+#endif
+
+
+
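+// kernel_dlarfb4_r_1_lib4 applies the block of 4 row-reflectors (reflector rows in pV,
+// triangular factor pT from kernel_dlarft_4_lib4) to a single row pD from the right:
+// W = D*V^T, then W = W*T, then D = D + W*V.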
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
+
+
+
+
diff --git a/kernel/avx/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..91d1cc0
--- /dev/null
+++ b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1434 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
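+// kernel_dgetrf_pivot_4_lib4 factors an m x 4 panel (panel-major, panel stride sda)
+// with partial pivoting: for each of the 4 columns it finds the pivot with a
+// vectorized idamax, swaps rows, stores the inverse pivot in inv_diag_A and
+// scales/updates the trailing columns with AVX arithmetic.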
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
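+	// vectorized idamax: each lane tracks the running max of |a| and the (fractional)
+	// row index of its candidate; idx holds index+0.2 per lane and is advanced by 4
+	// for every 4-row block, and the four lanes are reduced after the loops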
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+
+
+ // third column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+
+
+ // fourth column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
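+#if 0
+// Scalar reference sketch (disabled): an unoptimized version of the factorization performed
+// by kernel_dgetrf_pivot_4_lib4 above, kept only to document the algorithm. The helpers
+// ref_get/ref_set and kernel_dgetrf_pivot_4_ref are illustrative names, not library API.
+// Element (i,j) of the m x 4 panel-major block is assumed to live at
+// pA[(i/bs)*bs*sda + i%bs + bs*j] with bs=4.
+static double ref_get(double *pA, int sda, int i, int j)
+	{
+	const int bs = 4;
+	return pA[(i/bs)*bs*sda + i%bs + bs*j];
+	}
+static void ref_set(double *pA, int sda, int i, int j, double v)
+	{
+	const int bs = 4;
+	pA[(i/bs)*bs*sda + i%bs + bs*j] = v;
+	}
+static void kernel_dgetrf_pivot_4_ref(int m, double *pA, int sda, double *inv_diag_A, int *ipiv)
+	{
+	int ii, jj, ll;
+	double aik, amax, tmp;
+	for(jj=0; jj<4; jj++)
+		{
+		// partial pivoting: largest absolute value in column jj, rows jj..m-1
+		ipiv[jj] = jj;
+		amax = 0.0;
+		for(ii=jj; ii<m; ii++)
+			{
+			aik = ref_get(pA, sda, ii, jj);
+			aik = aik>=0.0 ? aik : -aik;
+			if(aik>amax)
+				{
+				amax = aik;
+				ipiv[jj] = ii;
+				}
+			}
+		if(amax==0.0)
+			{
+			// exactly zero pivot: record 0.0 and skip this column
+			// (the optimized kernel instead proceeds with unit scaling)
+			inv_diag_A[jj] = 0.0;
+			continue;
+			}
+		// swap rows jj and ipiv[jj] over all 4 columns of the block
+		if(ipiv[jj]!=jj)
+			for(ll=0; ll<4; ll++)
+				{
+				tmp = ref_get(pA, sda, jj, ll);
+				ref_set(pA, sda, jj, ll, ref_get(pA, sda, ipiv[jj], ll));
+				ref_set(pA, sda, ipiv[jj], ll, tmp);
+				}
+		// scale the sub-diagonal part of column jj and update the trailing columns
+		inv_diag_A[jj] = 1.0/ref_get(pA, sda, jj, jj);
+		for(ii=jj+1; ii<m; ii++)
+			{
+			aik = ref_get(pA, sda, ii, jj)*inv_diag_A[jj];
+			ref_set(pA, sda, ii, jj, aik);
+			for(ll=jj+1; ll<4; ll++)
+				ref_set(pA, sda, ii, ll, ref_get(pA, sda, ii, ll) - aik*ref_get(pA, sda, jj, ll));
+			}
+		}
+	return;
+	}
+#endif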
+
+
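+// variable-size variant: same factorization as above, but it returns early (after scaling
+// the current column) when n is 1, 2 or 3, and masks out rows beyond m in the updates and
+// in the pivot search.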
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>1)
+ {
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+ }
+
+ if(n==2)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // third column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>2)
+ {
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n==3)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // fourth column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>3)
+ {
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
diff --git a/kernel/avx/kernel_dsymv_6_lib4.S b/kernel/avx/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..b55690a
--- /dev/null
+++ b/kernel/avx/kernel_dsymv_6_lib4.S
@@ -0,0 +1,1031 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
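+// The prologue/epilogue macros above save and restore the callee-saved registers of the
+// respective calling convention (System V AMD64 on Linux/Mac; Microsoft x64 on Windows,
+// which additionally requires xmm6-xmm15 to be preserved) and issue vzeroupper around the
+// AVX code.
+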
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
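+// Each iteration of the main loop below processes one 4-row panel of A: it accumulates the
+// per-column partial sums of A^T*x_t in ymm0..ymm5 (one accumulator per column, reduced to
+// scalars later in the blend routine) and updates the corresponding 4 entries of z_n in
+// place with A*x_n, using the broadcast x_n values held in ymm6..ymm11.
+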
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 128(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 160(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+ vmaskmovpd 0(%r14), %ymm14, %ymm13
+
+ vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+// vmaskmovpd -32(%rsp), %ymm14
+ vmaskmovpd 0(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 32(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 64(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 96(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 128(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 160(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd %ymm13, %ymm14, 0(%r14)
+
+ sall $3, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm4 <- dirty
+// ymm5 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
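+// The horizontal adds below reduce each per-column partial sum [z_ja z_jb z_jc z_jd] to a
+// single scalar, packing z0..z3 into ymm0 and z4..z5 into the low half of ymm1, then apply
+// z = alpha*z + beta*y and zero the two unused upper lanes of ymm1 before the store.
+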
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm5, %ymm4, %ymm4
+// vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vextractf128 $0x1, %ymm4, %xmm5
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm4
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm4, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmovupd 32(%r12), %ymm13
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmulpd %ymm15, %ymm13, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %xmm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
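+//
+// Semantics, as a hedged sketch inferred from the register usage below (A is k x 6,
+// stored in 4-row panels with panel stride sda):
+//   z_n[0:k] += alpha_n * A * x_n                       // x_n has 6 entries, z_n updated in place
+//   z_t[0:6]  = beta_t * y_t + alpha_t * A^T * x_t      // x_t has k entries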
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_6_lib4
+ .type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_6_lib4
+ .def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+ vbroadcastsd 32(%r10), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vbroadcastsd 40(%r10), %ymm11
+ vmulpd %ymm15, %ymm11, %ymm11
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_6_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
+
+
+
+#if 0
+// TODO
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx/kernel_sgead_lib8.S b/kernel/avx/kernel_sgead_lib8.S
new file mode 100644
index 0000000..4cafa0a
--- /dev/null
+++ b/kernel/avx/kernel_sgead_lib8.S
@@ -0,0 +1,3096 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13 <- B
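+//
+// sgead ("general matrix add") computes B <- B + alpha*A on 8-row column panels.
+// This 0-offset kernel processes 4 panel columns per main-loop iteration plus a
+// one-column clean-up loop; the _8_1 .. _8_7 variants below handle an A panel
+// whose rows are shifted by 1..7 with respect to the B panel.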
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_lib8, @function
+inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r12
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps 64(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_0_lib8, .-inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_gen_lib8, @function
+inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
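+	// ymm15 now enables only the first m1 of the 8 panel rows: assuming .LC00
+	// holds the floats { 0.5, 1.5, ..., 7.5 } (defined at the end of this file,
+	// not shown here), lane i contains LC00[i] - m1, whose sign bit is set
+	// exactly when i < m1, which is the condition vmaskmovps tests.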
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 32(%r12), %ymm0
+ vmaskmovps 32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r12
+
+ vmovups -64(%r12), %ymm0
+ vmaskmovps 64(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovups -32(%r12), %ymm0
+ vmaskmovps -32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_gen_lib8, .-inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
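+//
+// Offset-1 case: each output column is rows 1..7 of the A0 panel followed by row 0
+// of the next (A1) panel. The body gathers them with a blend / in-lane rotate
+// (vpermilps $0x39) / lane swap (vperm2f128) / blend sequence; the #else branch
+// kept in the source is an equivalent unaligned-load variant.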
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_lib8, @function
+inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#else
+ vmovups 4(%r12), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovups 36(%r12), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovups -60(%r12), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovups -28(%r12), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_lib8, .-inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_gen_lib8, @function
+inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_gen_lib8, .-inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_lib8, @function
+inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_lib8, .-inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_gen_lib8, @function
+inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_gen_lib8, .-inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_lib8, @function
+inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_lib8, .-inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_gen_lib8, @function
+inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_gen_lib8, .-inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
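+//
+// Offset-4 case: each output column is the upper xmm half of the A0 panel column
+// followed by the lower xmm half of the A1 panel column, stitched together with a
+// single vinsertf128 instead of the blend/permute sequence used for the other offsets.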
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_lib8, @function
+inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 96(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_lib8, .-inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_gen_lib8, @function
+inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 96(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_gen_lib8, .-inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_lib8, @function
+inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_lib8, .-inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_gen_lib8, @function
+inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_gen_lib8, .-inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_lib8, @function
+inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_lib8, .-inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_gen_lib8, @function
+inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_gen_lib8, .-inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_lib8, @function
+inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_lib8, .-inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_gen_lib8, @function
+inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_gen_lib8, .-inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
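+//
+// sgead adds a scaled panel into another panel: B[0:8,0:k] += alpha * A[0:8,0:k],
+// where each column is stored as 8 contiguous floats (lib8 panel-major layout,
+// 32 bytes per column). A minimal scalar C reference of what this aligned
+// (offset-0) kernel computes (sketch only, not the library API):
+//
+//	void sgead_8_0_ref(int k, float alpha, const float *A, float *B)
+//		{
+//		int i, j;
+//		for(j=0; j<k; j++)
+//			for(i=0; i<8; i++)
+//				B[8*j+i] += alpha * A[8*j+i];
+//		}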
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_lib8
+ .type kernel_sgead_8_0_lib8, @function
+kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_lib8
+_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_lib8
+ .def kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_lib8, .-kernel_sgead_8_0_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx        r8
+// void kernel_sgead_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_gen_lib8
+ .type kernel_sgead_8_0_gen_lib8, @function
+kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_gen_lib8
+_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_gen_lib8
+ .def kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_gen_lib8, .-kernel_sgead_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_lib8
+ .type kernel_sgead_8_1_lib8, @function
+kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_lib8
+_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_lib8
+ .def kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_lib8, .-kernel_sgead_8_1_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_gen_lib8
+ .type kernel_sgead_8_1_gen_lib8, @function
+kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_gen_lib8
+_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_gen_lib8
+ .def kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_gen_lib8, .-kernel_sgead_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_lib8
+ .type kernel_sgead_8_2_lib8, @function
+kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_lib8
+_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_lib8
+ .def kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_lib8, .-kernel_sgead_8_2_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_gen_lib8
+ .type kernel_sgead_8_2_gen_lib8, @function
+kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_gen_lib8
+_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_gen_lib8
+ .def kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_gen_lib8, .-kernel_sgead_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_lib8
+ .type kernel_sgead_8_3_lib8, @function
+kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_lib8
+_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_lib8
+ .def kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_lib8, .-kernel_sgead_8_3_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_gen_lib8
+ .type kernel_sgead_8_3_gen_lib8, @function
+kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_gen_lib8
+_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_gen_lib8
+ .def kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_gen_lib8, .-kernel_sgead_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_lib8
+ .type kernel_sgead_8_4_lib8, @function
+kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_lib8
+_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_lib8
+ .def kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_lib8, .-kernel_sgead_8_4_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_gen_lib8
+ .type kernel_sgead_8_4_gen_lib8, @function
+kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_gen_lib8
+_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_gen_lib8
+ .def kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_gen_lib8, .-kernel_sgead_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_lib8
+ .type kernel_sgead_8_5_lib8, @function
+kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_lib8
+_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_lib8
+ .def kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_lib8, .-kernel_sgead_8_5_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_gen_lib8
+ .type kernel_sgead_8_5_gen_lib8, @function
+kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_gen_lib8
+_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_gen_lib8
+ .def kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_gen_lib8, .-kernel_sgead_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_lib8
+ .type kernel_sgead_8_6_lib8, @function
+kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_lib8
+_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_lib8
+ .def kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_lib8, .-kernel_sgead_8_6_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_gen_lib8
+ .type kernel_sgead_8_6_gen_lib8, @function
+kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_gen_lib8
+_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_gen_lib8
+ .def kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_gen_lib8, .-kernel_sgead_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8
+// void kernel_sgead_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_lib8
+ .type kernel_sgead_8_7_lib8, @function
+kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_lib8
+_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_lib8
+ .def kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_lib8, .-kernel_sgead_8_7_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx      r8         r9
+// void kernel_sgead_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_gen_lib8
+ .type kernel_sgead_8_7_gen_lib8, @function
+kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_gen_lib8
+_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_gen_lib8
+ .def kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_gen_lib8, .-kernel_sgead_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
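+// note (added): the .long values of .LC00 above are the IEEE-754 single-precision
+// bit patterns of 0.5f ... 7.5f (e.g. 1056964608 = 0x3f000000 = 0.5f and
+// 1089470464 = 0x40f00000 = 7.5f); the vector is compared against m1 to build
+// the row mask used by vmaskmovps in the _gen_ kernels.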
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgecp_lib8.S b/kernel/avx/kernel_sgecp_lib8.S
new file mode 100644
index 0000000..5cd2c00
--- /dev/null
+++ b/kernel/avx/kernel_sgecp_lib8.S
@@ -0,0 +1,2796 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
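+// note (added): on Windows x64 the registers rbx, rbp, rdi, rsi, r12-r15 and
+// xmm6-xmm15 are callee-saved, hence the larger STACKSIZE and the extra
+// saves/restores in the prologue/epilogue below compared with the
+// System V (Linux/Mac) version above.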
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
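+//
+// sgecp copies one 8-row panel into another: B[0:8,0:k] = A[0:8,0:k], with each
+// column stored as 8 contiguous floats (32 bytes). A minimal scalar C reference
+// of this aligned (offset-0) copy (sketch only, not the library API):
+//
+//	void sgecp_8_0_ref(int k, const float *A, float *B)
+//		{
+//		int i, j;
+//		for(j=0; j<k; j++)
+//			for(i=0; i<8; i++)
+//				B[8*j+i] = A[8*j+i];
+//		}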
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_lib8, @function
+inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_lib8:
+#endif
+#endif
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps %ymm0, 32(%r12)
+ addq $128, %r11
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_gen_lib8, @function
+inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovups 32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+ addq $128, %r11
+
+ vmovups -64(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovups -32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
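+//
+// offset-1 case: the source panel starts one row into A0, so each output column
+// takes rows 1..7 of the A0 column and row 0 of the A1 column (A1 = next 8-row
+// panel, 8*sda floats further on). Scalar sketch for one column, assuming a0,
+// a1 and b point to 8-float columns (illustration only, not library code):
+//
+//	int i;
+//	for(i=0; i<7; i++)
+//		b[i] = a0[i+1];
+//	b[7] = a1[0];
+//
+// the blend / vpermilps / vperm2f128 / blend sequence below performs this 1-row
+// rotation across the two 128-bit halves using only aligned loads.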
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_lib8, @function
+inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
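+	// variant 1 (added note): aligned loads, row realignment done in registers;
+	// the #else branch below keeps an alternative unaligned-load version of the
+	// same copy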
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#else
+ vmovups 4(%r11), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 36(%r11), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovups -60(%r11), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovups -28(%r11), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_gen_lib8, @function
+inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_lib8, @function
+inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_gen_lib8, @function
+inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_lib8, @function
+inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_gen_lib8, @function
+inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
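+//
+// offset-4 case: each output column is rows 4..7 of the A0 column followed by
+// rows 0..3 of the A1 column, so a single vinsertf128 (128-bit half merge) is
+// enough and no per-lane permute is needed.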
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_lib8, @function
+inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_gen_lib8, @function
+inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_lib8, @function
+inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_gen_lib8, @function
+inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_lib8, @function
+inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_gen_lib8, @function
+inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_lib8, @function
+inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_gen_lib8, @function
+inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
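+// copies the leading k columns of an 8-row panel from A to B (both panel-major, bs=8)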
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_lib8
+ .type kernel_sgecp_8_0_lib8, @function
+kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_lib8
+_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_lib8
+ .def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
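+// "gen" variant: B is written with a masked store, so only the first m1 rows of each
+// column are touched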
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .type kernel_sgecp_8_0_gen_lib8, @function
+kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_gen_lib8
+_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
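+// the _8_1 ... _8_7 variants read a source block starting at row offset 1 ... 7 of its
+// panel, so every column is blended together from two consecutive panels (A and A+8*sda)
+// and rotated before being stored to B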
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_lib8
+ .type kernel_sgecp_8_1_lib8, @function
+kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_lib8
+_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_lib8
+ .def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .type kernel_sgecp_8_1_gen_lib8, @function
+kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_gen_lib8
+_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_lib8
+ .type kernel_sgecp_8_2_lib8, @function
+kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_lib8
+_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_lib8
+ .def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .type kernel_sgecp_8_2_gen_lib8, @function
+kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_gen_lib8
+_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_lib8
+ .type kernel_sgecp_8_3_lib8, @function
+kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_lib8
+_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_lib8
+ .def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .type kernel_sgecp_8_3_gen_lib8, @function
+kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_gen_lib8
+_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_lib8
+ .type kernel_sgecp_8_4_lib8, @function
+kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_lib8
+_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_lib8
+ .def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .type kernel_sgecp_8_4_gen_lib8, @function
+kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_gen_lib8
+_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_lib8
+ .type kernel_sgecp_8_5_lib8, @function
+kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_lib8
+_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_lib8
+ .def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .type kernel_sgecp_8_5_gen_lib8, @function
+kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_gen_lib8
+_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_lib8
+ .type kernel_sgecp_8_6_lib8, @function
+kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_lib8
+_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_lib8
+ .def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .type kernel_sgecp_8_6_gen_lib8, @function
+kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_gen_lib8
+_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_lib8
+ .type kernel_sgecp_8_7_lib8, @function
+kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_lib8
+_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_lib8
+ .def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .type kernel_sgecp_8_7_gen_lib8, @function
+kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_gen_lib8
+_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
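+	// (the .long values above are the IEEE-754 single-precision bit patterns of
+	// 0.5f = 1056964608 up to 7.5f = 1089470464)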
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_16x4_lib8.S b/kernel/avx/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..5c2d6c4
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,7057 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
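+// note: unlike the SysV version, the Windows prologue/epilogue also preserves rdi, rsi
+// and xmm6-xmm15, which are callee-saved in the Microsoft x64 calling convention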
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12   <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
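+	// each unroll performs one rank-1 update: a 4-float block of B is broadcast to both
+	// 128-bit lanes, each entry is replicated with vshufps and multiplied by the A0/A1
+	// panels; separate vmulps/vaddps pairs are used since plain AVX provides no FMA,
+	// with ymm0-3 accumulating rows 0-7 and ymm4-7 rows 8-15 of columns 0-3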
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12   <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $8, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
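+	// the prefetches touch the 8x4 block of B used by the next main-loop iteration,
+	// one sdb stride ahead of the current position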
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ cmpl $8, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $7, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovapd		160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps		%ymm13, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+	// unroll 6
+	vbroadcastss 24(%r13), %ymm12 // B
+	vmulps %ymm13, %ymm12, %ymm15
+	vaddps %ymm15, %ymm0, %ymm0
+	vmulps %ymm14, %ymm12, %ymm15
+	vaddps %ymm15, %ymm4, %ymm4
+	vmovapd 224(%r11), %ymm10 // A
+	vbroadcastss 56(%r13), %ymm12 // B
+	vmulps %ymm13, %ymm12, %ymm15
+	vaddps %ymm15, %ymm1, %ymm1
+	vmulps %ymm14, %ymm12, %ymm15
+	vaddps %ymm15, %ymm5, %ymm5
+	vmovapd 224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss 88(%r13), %ymm12 // B
+	vmulps %ymm13, %ymm12, %ymm15
+	vaddps %ymm15, %ymm2, %ymm2
+	vmulps %ymm14, %ymm12, %ymm15
+	vaddps %ymm15, %ymm6, %ymm6
+	vbroadcastss 120(%r13), %ymm12 // B
+	vmulps %ymm13, %ymm12, %ymm15
+	vaddps %ymm15, %ymm3, %ymm3
+	vmulps %ymm14, %ymm12, %ymm15
+	vaddps %ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm14 // B[0]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm14 // B[1]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm14 // B[2]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm14 // B[3]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B-offB+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
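+// Reference sketch (illustrative only, not part of the build): the edge below
+// consumes min(k, 8-offB) columns so that B is panel-aligned afterwards.  With
+// A split into two 8-row panels A0/A1 and B panel-major (bs=8), it behaves
+// roughly like the following C fragment, where acc0/acc1 are hypothetical names
+// for the accumulators held in ymm0-3 and ymm4-7:
+//
+//   int kend = k < 8-offB ? k : 8-offB;
+//   for (int l = 0; l < kend; l++)
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 0; i < 8; i++) {
+//               acc0[i][j] += A0[i + 8*l] * B[offB + l + 8*j];
+//               acc1[i][j] += A1[i + 8*l] * B[offB + l + 8*j];
+//           }
+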
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm0, %ymm0
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm1, %ymm1
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm2, %ymm2
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm3, %ymm3
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // end-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B-offB+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
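+// Reference sketch (illustrative only): with B lower triangular, the l-th edge
+// iteration below only touches result columns j <= l, i.e. roughly
+//
+//   for (int l = 0; l < nb; l++)                  // nb <= 3 edge iterations
+//       for (int j = 0; j <= l && j < 4; j++)
+//           for (int i = 0; i < 8; i++) {
+//               acc0[i][j] += A0[i + 8*l] * B[offB + l + 8*j];
+//               acc1[i][j] += A1[i + 8*l] * B[offB + l + 8*j];
+//           }
+//
+// (acc0/acc1 and A0/A1 are hypothetical names for ymm0-3/ymm4-7 and the two A
+// panels); the offB==5,6,7 cases below additionally wrap to the next panel of B.
+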
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
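+// Reference sketch (illustrative only): each of the 16 rows x of the accumulator
+// (ymm0-3 hold rows 0-7, ymm4-7 rows 8-15) is solved against the 4x4 lower
+// triangular factor stored panel-major (bs=8) at r10, roughly as
+//
+//   for (int j = 0; j < kn; j++) {
+//       x[j] *= inv_diag_D[j];
+//       if (j+1 >= kn) break;
+//       for (int l = j+1; l < 4; l++)
+//           x[l] -= D[l + 8*j] * x[j];
+//   }
+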
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
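+// Reference sketch (illustrative only): a scalar version of this step for the
+// 16x4 block held in ymm0-7 (acc is a hypothetical name; sqrtf from <math.h>):
+//
+//   for (int j = 0; j < 4; j++) {
+//       float d = acc[j][j];
+//       float inv = (d > 0.0f) ? 1.0f/sqrtf(d) : 0.0f;   // zero out a bad pivot
+//       inv_diag_E[j] = inv;
+//       for (int i = 0; i < 16; i++)
+//           acc[i][j] *= inv;
+//       if (j+1 >= kn) break;                            // vs variant: early exit
+//       for (int l = j+1; l < 4; l++)
+//           for (int i = 0; i < 16; i++)
+//               acc[i][l] -= acc[l][j] * acc[i][j];
+//   }
+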
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
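+// Reference sketch (illustrative only): the scaling below amounts to
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = 0; i < 8; i++) {
+//           acc0[i][j] = alpha[0]*acc0[i][j] + beta[0]*C[i + 8*j];
+//           acc1[i][j] = alpha[0]*acc1[i][j] + beta[0]*C[8*sdc + i + 8*j];
+//       }
+//
+// with the whole beta part skipped (C never read) when beta[0]==0.0; acc0/acc1
+// are hypothetical names for the accumulators in ymm0-3 and ymm4-7.
+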
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+	addq %r13, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %rax // C1 <- C0
+	addq %r14, %rax // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+	addq %r14, %rbx // C2 <- C1 + 8*sdc*sizeof(float)
+
+	cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // C1 <- C0
+	addq %r11, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // C1 <- C0
+	addq %r12, %rax // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+	addq %r12, %rbx // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+	addq %r11, %r15 // D1 <- D0 + 8*sdd*sizeof(float)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
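+// The row mask below is built by comparing km against per-lane constants: km is
+// converted to float, broadcast, and subtracted from the vector at LC01 (defined
+// at the end of this file; assumed here to hold {8.5, 9.5, ..., 15.5}), so a lane
+// of the second 8-row panel is written by vmaskmovps exactly when its row index
+// is below km.  Illustrative C equivalent (thr[] mirrors the assumed LC01):
+//
+//   float thr[8] = {8.5f, 9.5f, 10.5f, 11.5f, 12.5f, 13.5f, 14.5f, 15.5f};
+//   int write_row[8];
+//   for (int i = 0; i < 8; i++)
+//       write_row[i] = (thr[i] - (float)km < 0.0f);   // sign bit drives the mask
+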
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+	jl 0f // end
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+	jl 0f // end
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+	je 0f // end
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+ jmp 0f
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
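+// Reference sketch (illustrative only) of the implemented offset==0 path below
+// (the offset>0 branches are still TODO), assuming LC00/LC01 hold {0.5..7.5} and
+// {8.5..15.5}; acc0/acc1 are hypothetical names for the 16x4 accumulator:
+//
+//   int cols = (n1 < 4 ? n1 : 4) - n0;
+//   for (int j = n0; j < n0 + cols; j++)
+//       for (int i = 0; i < 8; i++) {
+//           if (i >= m0)     D[i + 8*j]         = acc0[i][j];
+//           if (8 + i < m1)  D[8*sdd + i + 8*j] = acc1[i][j];
+//       }
+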
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D0
+	addq %r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%rbx)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%rbx)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%rbx)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%rbx)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbp // D1
+	addq %r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
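+// Reference sketch (illustrative only): the blends below keep the entries of D
+// strictly above the diagonal of the top 8x4 block and overwrite the rest:
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = 0; i < 8; i++) {
+//           D[i + 8*j]         = (i < j) ? D[i + 8*j] : acc0[i][j];
+//           D[8*sdd + i + 8*j] = acc1[i][j];
+//       }
+//
+// (acc0/acc1 are hypothetical names for ymm0-3/ymm4-7.)
+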
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+	vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+	vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
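+// Reference semantics (illustrative sketch, not part of the build): with all
+// matrices stored panel-major with panel height bs=8, and alpha/beta taken by
+// value here for clarity, the kernel computes
+//
+//   static void ref_sgemm_nt_16x4(int k, float alpha, const float *A, int sda,
+//                                 const float *B, float beta, const float *C,
+//                                 int sdc, float *D, int sdd)
+//   {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 0; i < 8; i++) {
+//               float d0 = 0.0f, d1 = 0.0f;
+//               for (int l = 0; l < k; l++) {
+//                   d0 += A[i + 8*l]         * B[j + 8*l];
+//                   d1 += A[8*sda + i + 8*l] * B[j + 8*l];
+//               }
+//               D[i + 8*j]         = alpha*d0 + beta*C[i + 8*j];
+//               D[8*sdd + i + 8*j] = alpha*d1 + beta*C[8*sdc + i + 8*j];
+//           }
+//   }
+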
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 12 13
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
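+// note: the _gen variant additionally takes panel offsets for C and D (offsetC, offsetD)
+// and the window bounds m0,m1,n0,n1; the store routine writes only the part of the
+// 16x4 block that falls inside [m0,m1) x [n0,n1), using masked stores.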
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
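+// note: in the nn variant B is read non-transposed and offsetB is the starting row
+// inside B's current 8-row panel; the inner_edge_gemm_add_nn routine first consumes
+// this unaligned head of the panel, then the main inner_kernel_gemm_add_nn loop runs.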
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
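+// note: ssyrk_nt_l reuses the gemm nt accumulation and scaling and differs only in the
+// store (inner_store_l_*), which writes just the lower-triangular part of the diagonal
+// block, as needed for a symmetric rank-k update.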
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1      2              3          4        5            6            7          8        9          10         11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1      2              3          4        5            6            7          8        9          10         11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
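+// note: strsm_nt_rl_inv computes D such that D * E^T = C - A * B^T, with E lower
+// triangular ("rl" = right, lower) and its diagonal supplied pre-inverted in
+// inv_diag_E, so the backsolve uses multiplications instead of divisions.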
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
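+// note: this fused kernel accumulates the (Ap, Bp) product, subtracts the (Am, Bm)
+// product, adds C unscaled (inner_scale_11), and then runs the same right-lower trsm
+// solve as above before storing D, saving one pass over the 16x4 block in memory.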
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
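+// note: spotrf_nt_l factorizes the block C - A * B^T: inner_edge_potrf_* computes the
+// Cholesky columns and writes the reciprocal diagonal entries to inv_diag_D, and only
+// the lower-triangular part of the result is stored.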
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
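+// note: the fused ssyrk+spotrf kernels chain the add (Ap, Bp) and subtract (Am, Bm)
+// accumulations directly into the Cholesky edge routine, so the syrk update and the
+// factorization of the same block are done without an intermediate store to memory.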
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
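+// note: strmm_nn_rl multiplies by a lower-triangular B on the right, D = alpha * A * B;
+// inner_edge_trmm_nn_rl handles the triangular head of B starting at offsetB, after
+// which the remaining panels go through the regular nn gemm loop and an alpha-only
+// scaling (inner_scale_a0, no beta/C term).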
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+ // call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x4_lib8.S b/kernel/avx/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..d319a83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,6673 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
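+
+// The macros above encode the two supported calling conventions: on System V (OS_LINUX,
+// OS_MAC) the first six integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and
+// the remaining ones on the stack, starting at STACKSIZE+8(%rsp) once the prologue has
+// reserved its save area; on Windows x64 the first four arrive in rcx, rdx, r8, r9 and the
+// stack arguments start at STACKSIZE+40(%rsp) (return address plus the 32-byte shadow
+// space). The prologue/epilogue save and restore the callee-saved registers used by the
+// kernels (plus xmm6-xmm15 on Windows, where they are callee-saved as well).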
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
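+// In C terms this routine accumulates an 8x4 block of the NT product D += A * B^T on
+// 8-wide packed panels: per k step it loads one 8-float column of A and the first 4
+// floats of the corresponding B panel row, and the vshufps rotations make the columns
+// land in the accumulators in the rotated order listed above. Scalar sketch of the math
+// only (packed-panel indexing, no blending):
+//
+// for(int l=0; l<k; l++)
+// 	for(int j=0; j<4; j++)
+// 		for(int i=0; i<8; i++)
+// 			D[i][j] += A[l*8+i] * B[l*8+j];
+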
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
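+// Scalar sketch of the NN accumulation, with the panel-major indexing inferred from the
+// addressing above (A advances by one 8-float column per k step; B advances by one float
+// inside an 8-row panel and jumps to the next panel, r13 bytes away, every 8 steps;
+// sdb8 below is a placeholder for that panel stride expressed in floats):
+//
+// for(int l=0; l<k; l++)
+// 	for(int j=0; j<4; j++)
+// 		for(int i=0; i<8; i++)
+// 			D[i][j] += A[l*8+i] * B[(l/8)*sdb8 + l%8 + j*8];
+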
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+ addq %r13, %r14 // B_next <- B + 4*sda*sizeof(double)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+ addq %r13, %r14 // B_next <- B + 4*sda*sizeof(double)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
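+// offB is the row index inside the current 8-row panel of B at which the first of the k
+// rows sits: the loop below consumes min(k, 8-offB) rows one at a time so that, if any
+// work remains, B ends up pointing at the start of the next panel and the main NN loop
+// can run panel-aligned.
+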
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %r15d
+ subl %r14d, %r15d // 8-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
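+// This edge handles the first (up to 3) k iterations, where the lower-triangular B
+// contributes only 1, 2 or 3 of the 4 columns; the branches on offB below deal with the
+// triangle crossing the 8-row panel boundary of B (offB==5, 6, 7 wrap to the next panel
+// part-way through the triangle).
+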
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r12, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r14d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r14d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movl $0, %r14d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r14d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r14d
+// jg 0f
+
+ // offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
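+// Forward-substitution sketch of the solve performed below (right, lower, transposed,
+// inverted diagonal): x[j] stands for the whole 8-element accumulator column in ymm0..3,
+// E is the 8-wide lower-triangular panel and inv_diag_E its precomputed reciprocal
+// diagonal:
+//
+// for(int j=0; j<4; j++)
+// 	{
+// 	for(int i=0; i<j; i++)
+// 		x[j] -= E[j+i*8] * x[i]; // E[j+i*8] is element (j,i) of the panel
+// 	x[j] *= inv_diag_E[j];
+// 	}
+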
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
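+// One 4-column step of a lower Cholesky factorization of the 8x4 block in ymm0..3; per
+// column j it inverts the square root of the pivot (falling back to 0.0 when the pivot is
+// not strictly positive), stores it in inv_diag_E, scales the column and updates the later
+// columns. Sketch, with x[j] the full 8-element column and x[j][i] its i-th entry:
+//
+// for(int j=0; j<4; j++)
+// 	{
+// 	float d = x[j][j];
+// 	float inv = d>0.0f ? 1.0f/sqrtf(d) : 0.0f;
+// 	inv_diag_E[j] = inv;
+// 	x[j] *= inv; // scales all 8 entries
+// 	for(int i=j+1; i<4; i++)
+// 		x[i] -= x[j][i] * x[j]; // rank-1 update of the remaining columns
+// 	}
+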
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
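+// Computes acc = alpha*acc + beta*C on the 8x4 block, column by column; when beta
+// compares equal to 0.0 the loads from C are skipped entirely. Sketch:
+//
+// for(int j=0; j<4; j++)
+// 	for(int i=0; i<8; i++)
+// 		acc[i][j] = alpha*acc[i][j] + (beta!=0.0f ? beta*C[i+j*8] : 0.0f);
+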
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
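+// Same alpha/beta scaling, but producing the transposed 4x8 result: the blend/unpack
+// sequence below both undoes the rotated accumulator layout left by the NT kernel and
+// transposes it, leaving the eight 4-element result columns in xmm0..xmm7 before beta*C
+// is added column by column.
+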
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+ vblendps $0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm0, %xmm0
+ vmovaps 32(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm1, %xmm1
+ vmovaps 64(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm2, %xmm2
+ vmovaps 96(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm3, %xmm3
+ vmovaps 128(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm4, %xmm4
+ vmovaps 160(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm5, %xmm5
+ vmovaps 192(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm6, %xmm6
+ vmovaps 224(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm7, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+	vblendps	$0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+	vmovaps		0(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm0, %xmm0
+	vmovaps		32(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm1, %xmm1
+	vmovaps		64(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm2, %xmm2
+	vmovaps		96(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm3, %xmm3
+	vmovaps		128(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm4, %xmm4
+	vmovaps		160(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm5, %xmm5
+	vmovaps		192(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm6, %xmm6
+	vmovaps		224(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm7, %xmm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
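+	// the two blend rounds undo the diagonal ordering produced by the nt
+	// inner kernel, leaving one full column of the 8x4 block per ymm register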
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
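+	// alpha=1.0, beta=1.0: simply add C to the blended accumulators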
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %r15 // C0
+ addq %r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+
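+	// each xmm register holds one 4-float column; columns sit 32 bytes apart
+	// in the 8-float panel of D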
+ vmovaps %xmm0, 0(%r10)
+ vmovaps %xmm1, 32(%r10)
+ vmovaps %xmm2, 64(%r10)
+ vmovaps %xmm3, 96(%r10)
+ vmovaps %xmm4, 128(%r10)
+ vmovaps %xmm5, 160(%r10)
+ vmovaps %xmm6, 192(%r10)
+ vmovaps %xmm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm12, %ymm14
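+	// ymm14 now has the sign bit set exactly in the first km lanes, so
+	// vmaskmovps writes only rows 0..km-1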
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %ymm1, %ymm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r10)
+ je 0f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm12, %xmm14
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %xmm1, %xmm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %xmm2, %xmm14, 64(%r10)
+ cmpl $4, %r12d
+ jl 0f // end
+ vmaskmovps %xmm3, %xmm14, 96(%r10)
+ cmpl $5, %r12d
+ jl 0f // end
+ vmaskmovps %xmm4, %xmm14, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %xmm5, %xmm14, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %xmm6, %xmm14, 192(%r10)
+ je 0f // end
+ vmaskmovps %xmm7, %xmm14, 224(%r10)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
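+	// ymm15 has the sign bit set for rows in [m0,m1); vmaskmovps writes only
+	// those rows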
+
+	// skip the first n0 columns: shift the solution registers and advance D
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+	// skip the first n0 columns: shift the solution registers and advance D
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ vmovaps %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ addq $32, %r11
+
+ cmpl $3, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ addq $32, %r11
+
+ cmpl $4, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ addq $32, %r11
+
+ cmpl $5, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ addq $32, %r11
+
+ cmpl $6, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %xmm1, %xmm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %xmm2, %xmm15, 64(%r11)
+ cmpl $4, %r15d
+ jl 7f // end
+ vmaskmovps %xmm3, %xmm15, 96(%r11)
+ cmpl $5, %r15d
+ jl 7f // end
+ vmaskmovps %xmm4, %xmm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %xmm5, %xmm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %xmm6, %xmm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %xmm7, %xmm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vblendps $0x3, %ymm13, %ymm2, %ymm2
+ vblendps $0x7, %ymm14, %ymm3, %ymm3
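+	// column j keeps its first j elements from memory, so only the lower
+	// triangle of the 8x4 block is overwritten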
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ //
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+	// skip the first n0 columns: shift the solution registers and advance D
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
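+// usage sketch (assumption: called from the BLASFEO C layer on 32-byte aligned,
+// panel-major 8-row data; computes D = alpha*A*B^T + beta*C on an 8x4 tile):
+//   float alpha = 1.0f, beta = 0.0f;
+//   kernel_sgemm_nt_8x4_lib8(k, &alpha, A, B, &beta, C, D);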
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
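+	// A and B are swapped: the 8x4 nt kernel computes the transpose of the
+	// 4x8 result, which is transposed back by the tran_scale routine below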
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
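+	// m0..m1 and n0..n1 select the sub-block of the 8x4 tile that is actually
+	// written (boundary tiles)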
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner tran scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
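+	// handle the leading, non-panel-aligned part of B (offsetB) before the
+	// main nn loop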
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
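+	// store_l writes only the lower triangle; the strictly upper part of D is
+	// left untouched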
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
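+	// triangular solve with the lower-triangular factor E (rl: right, lower);
+	// inv_diag_E holds the reciprocals of the diagonal entries so divisions
+	// become multiplications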
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
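+//
+// Illustrative sketch (hypothetical helper, reusing the panel-layout assumption
+// above): the fused kernel is expected to behave like a gemm add pass over
+// (kp, Ap, Bp), a gemm sub pass over (km, Am, Bm), and then the same
+// right-lower-transposed solve as kernel_strsm_nt_rl_inv_8x4_lib8:
+//
+// void ref_sgemm_strsm_nt_rl_inv_8x4(int kp, const float *Ap, const float *Bp,
+//                                    int km, const float *Am, const float *Bm,
+//                                    const float *C, float *D, const float *E,
+//                                    const float *inv_diag_E)
+// 	{
+// 	for(int j=0; j<4; j++)
+// 		for(int i=0; i<8; i++)
+// 			{
+// 			float acc = C[i+8*j];
+// 			for(int l=0; l<kp; l++) acc += Ap[i+8*l] * Bp[j+8*l]; // add part
+// 			for(int l=0; l<km; l++) acc -= Am[i+8*l] * Bm[j+8*l]; // sub part
+// 			for(int l=0; l<j; l++)  acc -= D[i+8*l] * E[j+8*l];   // trsm step
+// 			D[i+8*j] = acc * inv_diag_E[j];
+// 			}
+// 	}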
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
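+//
+// Reference semantics (illustrative sketch, hypothetical helper name, no
+// handling of a non-positive pivot shown): the kernel is expected to form
+// P = C - A*B^T, factorize its top 4x4 block as L*L^T (lower Cholesky), apply
+// the same columns to rows 4..7, and store the reciprocal diagonal in
+// inv_diag_D:
+//
+// #include <math.h>
+//
+// void ref_spotrf_nt_l_8x4(int k, const float *A, const float *B,
+//                          const float *C, float *D, float *inv_diag_D)
+// 	{
+// 	for(int j=0; j<4; j++)
+// 		for(int i=j; i<8; i++)
+// 			{
+// 			float acc = C[i+8*j];
+// 			for(int l=0; l<k; l++) acc -= A[i+8*l] * B[j+8*l]; // C - A*B^T
+// 			for(int l=0; l<j; l++) acc -= D[i+8*l] * D[j+8*l]; // previous columns
+// 			if(i==j)
+// 				{
+// 				float d = sqrtf(acc);
+// 				inv_diag_D[j] = 1.0f/d;
+// 				D[i+8*j] = d;
+// 				}
+// 			else
+// 				D[i+8*j] = acc * inv_diag_D[j];
+// 			}
+// 	}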
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+	movq	ARG7, %r11 // km
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
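+//
+// Illustrative note (sketch): this fused kernel is expected to be equivalent to
+// forming P = C + Ap*Bp^T - Am*Bm^T (an add pass over kp columns and a sub pass
+// over km columns) and then running the same 8x4 lower Cholesky edge as in the
+// ref_spotrf_nt_l_8x4 sketch above, writing the factor to D and the reciprocal
+// diagonal to inv_diag_D.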
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+	movl	$4, %r11d // kn = 4 (factorize the full 4x4 block)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
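+//
+// Reference semantics (illustrative sketch, hypothetical helper name): with B
+// lower triangular, i.e. B[l][j] == 0 for l < j, the kernel is expected to
+// compute D = alpha * A * B on an 8x4 block. The sketch assumes offsetB == 0
+// and the lib8 panel layout, where element (l,j) of B is assumed to sit at
+// B[(l/8)*8*sdb + l%8 + 8*j]:
+//
+// void ref_strmm_nn_rl_8x4(int k, float alpha, const float *A,
+//                          const float *B, int sdb, float *D)
+// 	{
+// 	for(int j=0; j<4; j++)
+// 		for(int i=0; i<8; i++)
+// 			{
+// 			float acc = 0.0f;
+// 			for(int l=j; l<k; l++) // l<j skipped: that part of B is zero
+// 				acc += A[i+8*l] * B[(l/8)*8*sdb + l%8 + 8*j];
+// 			D[i+8*j] = alpha * acc;
+// 			}
+// 	}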
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
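+//
+// Illustrative note: the _gen variant computes the same alpha*A*B product as the
+// plain 8x4 kernel sketched above, but its store routine writes only the rows in
+// [m0,m1) and the columns in [n0,n1) of the result, starting offsetD rows into
+// the destination panels D with panel stride sdd, so the output block may be
+// clipped and may straddle two 8-row panels.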
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x8_lib8.S b/kernel/avx/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..354fa83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5514 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
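+
+// Note on the macros above: on OS_LINUX/OS_MAC (System V AMD64 ABI) the first
+// six arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the seventh sits at
+// 8(%rsp) on entry, just above the return address; after PROLOGUE executes
+// "subq $STACKSIZE, %rsp" (STACKSIZE = 64) that slot is reached as
+// STACKSIZE + 8(%rsp), so "movq ARG7, %r10" assembles as "movq 64+8(%rsp), %r10".
+// On OS_WINDOWS (x64 ABI) only the first four arguments are in registers
+// (rcx, rdx, r8, r9) and the caller reserves a 32-byte shadow space, so the
+// fifth argument is at 40(%rsp) on entry and at STACKSIZE + 40(%rsp) after
+// PROLOGUE; rdi, rsi and xmm6-xmm15 are callee-saved there, which is why the
+// Windows PROLOGUE/EPILOGUE also spill and restore them.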
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
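+// Illustrative reference (sketch): mathematically this inner routine, together
+// with the blend routine that later de-shuffles ymm0-ymm7 into plain columns,
+// accumulates the 8x8 rank-k product acc += A * B^T, with A and B stored as
+// 8-row lib8 panels (element (i,l) at p[i + 8*l]); the registers hold the
+// rotated index maps listed above. A plain-C equivalent of the accumulation
+// (hypothetical helper name):
+//
+// void ref_gemm_add_nt_8x8(int k, const float *A, const float *B, float acc[8*8])
+// 	{
+// 	for(int j=0; j<8; j++)
+// 		for(int i=0; i<8; i++)
+// 			for(int l=0; l<k; l++)
+// 				acc[i+8*j] += A[i+8*l] * B[j+8*l];
+// 	}
+//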
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
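+// Illustrative reference (sketch, hypothetical helper name): the nn variant
+// broadcasts scalars of B, which is assumed stored in 8-row panels with panel
+// stride r13 bytes (8*sdb floats), and accumulates one output column of
+// acc += A * B per ymm register, roughly:
+//
+// void ref_gemm_add_nn_8x8(int k, const float *A, const float *B, int sdb,
+//                          float acc[8*8])
+// 	{
+// 	for(int j=0; j<8; j++)
+// 		for(int i=0; i<8; i++)
+// 			for(int l=0; l<k; l++)
+// 				acc[i+8*j] += A[i+8*l] * B[(l/8)*8*sdb + l%8 + 8*j];
+// 	}
+//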
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 132(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 164(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 196(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 228(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 136(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 168(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 200(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 232(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 140(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 172(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 204(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 236(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 144(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 176(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 208(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 240(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 148(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 180(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 212(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 244(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 152(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 184(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 216(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 248(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 156(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 188(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 220(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 252(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
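+	// r14 holds the start of the next 8-row panel of B: each pass of this loop
+	// consumes a full panel, so B (r12) hops panel-to-panel by the stride in r13
+	// while A (r11) advances linearly.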
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
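+// note: this edge step consumes kend = min(k, 8-offB) rows of B so that the main
+// nn kernel starts at a panel boundary; per row l it does, roughly,
+//   for(j=0; j<8; j++)
+//     acc[:][j] += A[:][l] * B[offB+l][j];   // acc[:][j] held in ymm0..ymm7
+// then advances A past the kend columns and B to the start of its next panel.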
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r14d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
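+// note: forward substitution over the 8 columns of the accumulator; roughly
+//   for(j=0; j<n; j++) {
+//     acc[:][j] *= inv_diag_D[j];
+//     for(i=j+1; i<n; i++)
+//       acc[:][i] -= acc[:][j] * D[i+8*j];   // D panel-major, lower triangular
+//   }
+// where n is capped by kn (r12d) in this vs variant.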
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 16(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 20(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 24(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 28(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 48(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 52(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 56(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 60(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 80(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 84(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 88(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 92(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vbroadcastss 112(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 116(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 120(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 124(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 16(%r11), %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 152(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 156(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 20(%r11), %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 188(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 24(%r11), %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 28(%r11), %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
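+// note: right-looking Cholesky of the 8x8 accumulator (one column per ymm register);
+// roughly, for each column j:
+//   d = acc[j][j];  inv = (d > 0.0) ? 1.0/sqrt(d) : 0.0;  inv_diag_E[j] = inv;
+//   acc[:][j] *= inv;
+//   for(i=j+1; i<n; i++)  acc[:][i] -= acc[:][j] * acc[i][j];
+// with n capped by kn (r11d) in this vs variant.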
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm4, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+ jbe 9f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+10:
+	vmovss	%xmm13, 16(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm5, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+ jbe 11f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+12:
+	vmovss	%xmm13, 20(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm6, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+ jbe 13f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+14:
+	vmovss	%xmm13, 24(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm7, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+ jbe 15f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+16:
+	vmovss	%xmm13, 28(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
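+// note: computes acc[:][j] = alpha*acc[:][j] + beta*C[:][j] for j=0..7; when
+// beta==0.0 the C term is skipped entirely, so C may be left unset in that case.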
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
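+// note: same scaling as inner_scale_ab_8x8_lib8, but C may start at a row offset
+// inside its 8-row panel (split across C0 and C1); only the offset==0 path is
+// implemented below, the offset>0 branches are still TODO stubs.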
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	// beta is in ymm15 here (ymm14 holds 0.0)
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d // offset
+	jl		1f
+	jg		2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d // offset
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d // offset
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
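+// note: the vblendps sequence below un-interleaves the accumulators produced by
+// the shuffled nt inner kernel, so that ymm0..ymm7 again hold plain columns 0..7
+// of the 8x8 block before alpha and beta are applied.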
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	// beta is in ymm15 here (ymm14 holds 0.0)
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d // offset
+	jl		1f
+	jg		2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d // offset
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d // offset
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0 (generalized)
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm4, 128(%r10)
+ vmovaps %ymm5, 160(%r10)
+ vmovaps %ymm6, 192(%r10)
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
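+	// ymm15[i] = LC00[i] - km: the sign bit is set for rows i < km, and those are
+	// the rows vmaskmovps stores below (this assumes .LC00 holds the ascending
+	// per-lane constants 0.5, 1.5, ..., 7.5, as in the other lib8 kernels).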
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
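+	// row mask (LC00 as in the vs mask above): ymm14 = m0 - LC00 (sign set for rows
+	// i >= m0) and ymm15 = LC00 - m1 (sign set for rows i < m1); the AND keeps
+	// exactly the rows m0 <= i < m1 for vmaskmovps.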
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+	vmovaps	%ymm1, %ymm0
+	vmovaps	%ymm2, %ymm1
+	vmovaps	%ymm3, %ymm2
+	vmovaps	%ymm4, %ymm3
+	vmovaps	%ymm5, %ymm4
+	addq	$32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
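+// note: only the lower triangle (diagonal included) is written: for column j the
+// first j rows are re-read from D and blended back in, so the strictly upper part
+// of D is left untouched.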
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps 32(%r10), %ymm14
+ vblendps $0x01, %ymm14, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps 64(%r10), %ymm14
+ vblendps $0x03, %ymm14, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps 96(%r10), %ymm14
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps 128(%r10), %ymm14
+ vblendps $0x0f, %ymm14, %ymm4, %ymm4
+ vmovaps %ymm4, 128(%r10)
+ vmovaps 160(%r10), %ymm14
+ vblendps $0x1f, %ymm14, %ymm5, %ymm5
+ vmovaps %ymm5, 160(%r10)
+ vmovaps 192(%r10), %ymm14
+ vblendps $0x3f, %ymm14, %ymm6, %ymm6
+ vmovaps %ymm6, 192(%r10)
+ vmovaps 224(%r10), %ymm14
+ vblendps $0x7f, %ymm14, %ymm7, %ymm7
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmovaps 128(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmovaps 160(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmovaps 192(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmovaps 224(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+	vmovaps	%ymm1, %ymm0
+	vmovaps	%ymm2, %ymm1
+	vmovaps	%ymm3, %ymm2
+	vmovaps	%ymm4, %ymm3
+	vmovaps	%ymm5, %ymm4
+	addq	$32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmovaps 128(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmovaps 160(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmovaps 192(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmovaps 224(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
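+// note: computes D = alpha*A*B' + beta*C on one 8x8 block, with A, B, C and D
+// stored in the lib8 panel-major layout (8-row panels of floats).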
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
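+// note: computes D = alpha*A*B + beta*C on one 8x8 block; B is addressed through
+// offsetB and its panel stride sdb, which is why the edge routine is called first
+// to align B to a panel boundary before the main nn inner kernel runs.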
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
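+//
+// Semantics sketch (editorial note, not part of the original source): same accumulation
+// as the 8x8 gemm nt kernels, but only the lower triangle of the result block is written:
+//
+//	D(i,j) = alpha * sum_l A(i,l)*B(j,l) + beta*C(i,j), for j <= i (strictly upper part of D untouched)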
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
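+//
+// Semantics sketch (editorial note, not part of the original source): the kernel forms
+// M = C - A*B^T on the 8x8 block, then solves D * E^T = M with E lower triangular, i.e.
+// D = (C - A*B^T) * E^{-T}, using the precomputed reciprocals inv_diag_E of E's diagonal:
+//
+//	for(jj=0; jj<8; jj++)
+//		// column jj of D: d_jj = ( m_jj - sum_{ll<jj} E(jj,ll) * d_ll ) * inv_diag_E[jj]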
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movl $8, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // m1
+ movq ARG9, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
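+//
+// Semantics sketch (editorial note, not part of the original source): the kernel forms
+// M = C - A*B^T on the 8x8 block and computes its lower Cholesky factor, D*D^T = M,
+// storing the reciprocals of the diagonal of D into inv_diag_D; e.g. for the first column:
+//
+//	D(0,0) = sqrt(M(0,0));  inv_diag_D[0] = 1.0f/D(0,0);  D(i,0) = M(i,0)*inv_diag_D[0];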
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
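+//
+// Semantics sketch (editorial note, not part of the original source): fused syrk + potrf.
+// The kernel accumulates Ap*Bp^T over kp columns, subtracts Am*Bm^T over km columns, adds
+// C, and factorizes the resulting 8x8 block exactly like kernel_spotrf_nt_l_8x8_lib8:
+//
+//	D = chol_lower( C + Ap*Bp^T - Am*Bm^T ),  inv_diag_D[i] = 1/D(i,i)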
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_diag_lib8.c b/kernel/avx/kernel_sgemm_diag_lib8.c
new file mode 100644
index 0000000..63183b2
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_diag_lib8.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
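+
+
+#if 0
+// Reference semantics of the kernel above (editorial sketch, not part of the original
+// BLASFEO sources, kept under #if 0 so it does not affect the build): it computes
+// D = alpha * A * diag(B[0..3]) on kmax x 4 operands stored in 8-row panels (lib8
+// layout); as in the kernel above, the A and D pointers advance by 8*sda and 8*sdd
+// floats per panel of 8 rows.
+static void ref_sgemm_diag_right_4_a0(int kmax, float alpha, const float *A, int sda, const float *B, float *D, int sdd)
+	{
+	const int ps = 8; // panel height
+	int ii, jj, ll;
+	for(ii=0; ii<kmax; ii+=ps)
+		{
+		int m = kmax-ii<ps ? kmax-ii : ps; // rows in this (possibly partial) panel
+		for(jj=0; jj<4; jj++)
+			{
+			for(ll=0; ll<m; ll++)
+				D[jj*ps+ll] = alpha * B[jj] * A[jj*ps+ll];
+			}
+		A += ps*sda;
+		D += ps*sdd;
+		}
+	}
+#endif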
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_store_ps( &D[0], d_00 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+
+ }
+
+ }
+
+
+
+
diff --git a/kernel/avx/kernel_sgemv_4_lib8.S b/kernel/avx/kernel_sgemv_4_lib8.S
new file mode 100644
index 0000000..1508ebe
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_4_lib8.S
@@ -0,0 +1,2935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_4_lib8, @function
+inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_4_lib8, .-inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
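+
+
+// Editorial note (not part of the original source): the clean-up path above builds the
+// vmaskmovps mask from the .LC00 constants { 0.5 1.5 ... 7.5 } (in memory order):
+// broadcasting the number m of remaining elements and computing LC00 - m gives a vector
+// whose lane i is negative (sign bit set, i.e. mask active) exactly when i < m. The same
+// idea in C intrinsics, as in the sgemm_diag kernels (tail_mask is a hypothetical helper):
+//
+//	__m256i tail_mask(int m)
+//		{
+//		const float idx[8] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};
+//		float m_f = (float) m;
+//		return _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( idx ), _mm256_broadcast_ss( &m_f ) ) );
+//		}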
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t+k*sizeof(float)
+// r14 <- z_n+k*sizeof(float)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_nt_4_lib8, @function
+inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_nt_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_nt_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+ vmovups 0(%r14), %ymm13
+
+ vmovaps 0(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 64(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 96(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovups %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+// vmovups %ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_nt_4_lib8, .-inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_4_lib8, @function
+inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_4_lib8, .-inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
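+	// the unpack/shuffle sequence above transposes the masked lower-triangular
+	// 8x8 panel of E into xmm registers so that the substitution below can
+	// eliminate each solved component from the remaining ones; it processes
+	// z7 down to z0, scaling each component by its inv_diag_E entry first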
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13  <- x_t+k*sizeof(float)
+// r14  <- z_n+k*sizeof(float)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4l_lib8, @function
+inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4l_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4l_lib8:
+#endif
+#endif
+
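+	// diagonal-block edge of the symmetric kernel: a lane mask restricts all
+	// loads/stores to the active rows of this panel, and for each of the 4
+	// columns the entries above the diagonal are zeroed with vblendps, so the
+	// diagonal and below feed the A^T*x_t accumulators (ymm0-ymm3) while only
+	// the strictly below-diagonal part updates z_n, counting each diagonal
+	// entry exactly once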
+ movl $8, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x0f, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4l_lib8, .-inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+
+
+
+
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4r_lib8, @function
+inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4r_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4r_lib8:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm13, %xmm11
+
+ vmaskmovps 0(%r13), %xmm11, %xmm12
+ vmaskmovps 0(%r14), %xmm11, %xmm13
+
+ vmaskmovps 0(%r11), %xmm11, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm0, %xmm15, %xmm0
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm6, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 32(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm1, %xmm15, %xmm1
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm7, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 64(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm2, %xmm15, %xmm2
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm8, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 96(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm3, %xmm15, %xmm3
+// vxorps %xmm15, %xmm15, %xmm15
+// vblendps $0x0f, %xmm15, %xmm14, %xmm14
+// vmulps %xmm14, %xmm9, %xmm15
+// vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps %xmm13, %xmm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4r_lib8, .-inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib8, @function
+inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib8:
+#endif
+#endif
+
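+	// each of ymm0-ymm3 holds 8 partial sums of one dot product; the vhaddps
+	// cascade and the final vextractf128/vaddps reduce them to xmm0 = [z0 z1 z2 z3],
+	// which is then combined as (roughly) z[j] = alpha[0]*z[j] + beta[0]*y[j]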
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vbroadcastss 0(%r11), %xmm15
+ vmovups 0(%r12), %xmm14
+ vmulps %xmm15, %xmm14, %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib8, .-inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib8, @function
+inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vmovups 0(%r11), %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib8, .-inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib8, @function
+inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // beta
+ vmovups 0(%r10), %xmm14
+ vsubps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib8, .-inner_blend_t_scale_m11_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib8, @function
+inner_store_4_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib8; .scl 2; .type 32; .endef
+inner_store_4_lib8:
+#endif
+#endif
+
+ vmovups %xmm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib8, .-inner_store_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib8, @function
+inner_store_4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4_vs_lib8:
+#endif
+#endif
+
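+	// store mask: (i+0.5) - km is negative exactly for lanes i < km, so the
+	// masked store below writes only the first km (at most 4) results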
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm15, %xmm14, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib8, .-inner_store_4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib8, @function
+inner_store_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib8, .-inner_store_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+
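+// roughly, in C terms (with A packed in 8-row panels of panel stride sda, as
+// used throughout this file):
+//   for(j=0; j<4; j++)
+//     z[j] = beta[0]*y[j] + alpha[0] * sum_{i=0,...,k-1} A[i][j] * x[i]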
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_lib8
+ .type kernel_sgemv_t_4_lib8, @function
+kernel_sgemv_t_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_lib8
+_kernel_sgemv_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_lib8
+ .def kernel_sgemv_t_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_lib8, .-kernel_sgemv_t_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .type kernel_sgemv_t_4_vs_lib8, @function
+kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_vs_lib8
+_kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .def kernel_sgemv_t_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_vs_lib8, .-kernel_sgemv_t_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .type kernel_sgemv_t_4_gen_lib8, @function
+kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_gen_lib8
+_kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .def kernel_sgemv_t_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_gen_lib8, .-kernel_sgemv_t_4_gen_lib8
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemv_nt_4_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+
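+// fused kernel: a single sweep over 4 columns of A updates both directions,
+// roughly z_n[i] += alpha_n[0]*sum_{j<4} A[i][j]*x_n[j] (written back to memory
+// by the inner nt kernel) and z_t[j] = beta_t[0]*y_t[j] + alpha_t[0]*sum_i A[i][j]*x_t[i]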
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_lib8
+ .type kernel_sgemv_nt_4_lib8, @function
+kernel_sgemv_nt_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_lib8
+_kernel_sgemv_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_lib8
+ .def kernel_sgemv_nt_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_lib8, .-kernel_sgemv_nt_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemv_nt_4_vs_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .type kernel_sgemv_nt_4_vs_lib8, @function
+kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_vs_lib8
+_kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .def kernel_sgemv_nt_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_vs_lib8, .-kernel_sgemv_nt_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4l_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
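+// symmetric matrix-vector product using only the stored lower triangle: the
+// edge routine handles the 4-column diagonal block (masking the entries above
+// the diagonal), then the nt kernel sweeps the panels below it so that each
+// off-diagonal entry contributes once as A*x and once as A^T*x; roughly
+// z[i] += alpha[0]*sum_j Asym[i][j]*x[j] for the 4-column block covered by this call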
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_lib8
+ .type kernel_ssymv_l_4l_lib8, @function
+kernel_ssymv_l_4l_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_lib8
+_kernel_ssymv_l_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_lib8
+ .def kernel_ssymv_l_4l_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_lib8, .-kernel_ssymv_l_4l_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4r_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_lib8
+ .type kernel_ssymv_l_4r_lib8, @function
+kernel_ssymv_l_4r_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_lib8
+_kernel_ssymv_l_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_lib8
+ .def kernel_ssymv_l_4r_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_lib8, .-kernel_ssymv_l_4r_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4l_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .type kernel_ssymv_l_4l_gen_lib8, @function
+kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_gen_lib8
+_kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .def kernel_ssymv_l_4l_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_gen_lib8, .-kernel_ssymv_l_4l_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4r_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .type kernel_ssymv_l_4r_gen_lib8, @function
+kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_gen_lib8
+_kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .def kernel_ssymv_l_4r_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_gen_lib8, .-kernel_ssymv_l_4r_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
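+// lane indices offset by 0.5: subtracting an integer lane count from these
+// values (or vice versa) sets the per-lane sign bits that the vmaskmovps-based
+// edge, clean-up and store code above uses as load/store masks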
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_sgemv_8_lib8.S b/kernel/avx/kernel_sgemv_8_lib8.S
new file mode 100644
index 0000000..aafd8cb
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_8_lib8.S
@@ -0,0 +1,2837 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
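+// PROLOGUE/EPILOGUE save and restore the callee-saved registers used by these
+// kernels (rbx, rbp, r12-r15; on Windows also rdi, rsi and xmm6-xmm15) and issue
+// vzeroupper; stack-passed arguments are reached as ARG7 and above (ARG5 and
+// above on Windows) relative to the adjusted stack pointer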
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11  <- A+8*k*sizeof(float)
+// r12  <- x+k*sizeof(float)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_n_8_lib8, @function
+inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_n_8_lib8:
+#endif
+#endif
+
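+	// main loop: 4 columns per iteration; each x entry is broadcast and multiplied
+	// by an 8-row column of A, with the products kept in separate accumulators
+	// (ymm0-ymm3) to hide floating-point latency; the clean-up loop handles the
+	// remaining columns one at a time, and the partial accumulators are combined
+	// by the caller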
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vbroadcastss 4(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vbroadcastss 8(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vbroadcastss 12(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $16, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $4, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13  <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_8_lib8, @function
+inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_8_lib8:
+#endif
+#endif
+
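+	// main loop: one full 8-row panel per iteration; 8 entries of x are loaded once
+	// and multiplied against each of the panel's 8 columns, accumulating 8
+	// independent dot products in ymm0-ymm7; the clean-up below masks the final
+	// partial panel using the .LC00 lane-index constant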
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmaskmovps 128(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmaskmovps 160(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmaskmovps 192(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmaskmovps 224(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ sall $2, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
+// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
+// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
+// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
+// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
+// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
+// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
+// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
+// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
+// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
+// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
+// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
+// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
+// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
+// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
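+// a C-like sketch of the edge case handled below (assuming 0 < offA < 8): the first,
+// partial panel row-block is processed under a lane mask that keeps only rows
+// offA <= ll < min(8, offA+k), then A, x and k are advanced so that the main kernel
+// continues on aligned panels (kend stands for the number of rows actually processed):
+//
+//	kend = k < 8-offA ? k : 8-offA;
+//	for(jj=0; jj<8; jj++)
+//		for(ll=0; ll<kend; ll++)
+//			acc[jj][ll+offA] += A[8*jj+ll] * x[ll];
+//	A += 8*sda - offA; x += 8-offA; k -= 8-offA;
+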
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_8_lib8, @function
+inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
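+// a C-like sketch of the forward substitution performed below (assuming E is an 8x8
+// lower triangular block stored panel-major, element (i,j) at E[i+8*j], and that
+// inv_diag_E holds the reciprocals of its diagonal; z stands for the 8 lanes of ymm0):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		z[jj] *= inv_diag_E[jj];
+//		for(ii=jj+1; ii<8; ii++)
+//			z[ii] -= E[ii+8*jj] * z[jj];
+//		}
+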
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_lib8, @function
+inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_vs_lib8, @function
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $5, %r12d
+ jl 0f // ret
+
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $6, %r12d
+ jl 0f // ret
+
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $7, %r12d
+ jl 0f // ret
+
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $8, %r12d
+ jl 0f // ret
+
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
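+// a C-like sketch of the backward substitution performed below (assuming E is an 8x8
+// lower triangular block stored panel-major, element (i,j) at E[i+8*j], solved against
+// its transpose; inv_diag_E holds the reciprocals of the diagonal, z the 8 lanes of ymm0):
+//
+//	for(jj=7; jj>=0; jj--)
+//		{
+//		z[jj] *= inv_diag_E[jj];
+//		for(ii=0; ii<jj; ii++)
+//			z[ii] -= E[jj+8*ii] * z[jj];
+//		}
+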
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmovups 0(%r14), %ymm15
+ vblendvps %ymm14, %ymm0, %ymm15, %ymm0
+
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ cmpl $8, %r13d
+ jl 1f
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ cmpl $7, %r13d
+ jl 1f
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ cmpl $6, %r13d
+ jl 1f
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ cmpl $5, %r13d
+ jl 1f
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ cmpl $4, %r13d
+ jl 1f
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ cmpl $3, %r13d
+ jl 1f
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ cmpl $2, %r13d
+ jl 1f
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ cmpl $1, %r13d
+ jl 1f
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+1:
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
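+// a C-like sketch of the reduction and scaling performed below (acc0..acc3 stand for
+// the four accumulators ymm0..ymm3 produced by the gemv_n inner kernel; the result is
+// left in ymm0 and written out by the store routine):
+//
+//	for(ii=0; ii<8; ii++)
+//		z[ii] = alpha[0] * (acc0[ii] + acc1[ii] + acc2[ii] + acc3[ii]) + beta[0] * y[ii];
+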
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib8, @function
+inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_8_lib8, @function
+inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
+// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
+// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
+// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
+// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
+// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
+// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
+// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
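+// a C-like sketch of the horizontal reduction and scaling performed below (acc[jj]
+// stands for the 8 lanes of ymm<jj> produced by the gemv_t inner kernel; the result
+// is left in ymm0 and written out by the store routine):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		tmp = 0.0;
+//		for(ll=0; ll<8; ll++)
+//			tmp += acc[jj][ll];
+//		z[jj] = alpha[0]*tmp + beta[0]*y[jj];
+//		}
+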
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib8, @function
+inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
+// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
+// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
+// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
+// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
+// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
+// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
+// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_8_lib8, @function
+inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib8, @function
+inner_store_8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib8; .scl 2; .type 32; .endef
+inner_store_8_lib8:
+#endif
+#endif
+
+ vmovups %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib8, .-inner_store_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
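+// a C-like sketch of the masked store performed below: the .LC00 lane-index constant
+// is compared against km to build a lane mask, so that only the first km elements of
+// ymm0 (acc in the sketch) are written back:
+//
+//	for(ii=0; ii<km; ii++)
+//		D[ii] = acc[ii];
+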
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_vs_lib8, @function
+inner_store_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm14, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
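+// a C-like sketch of the generalized masked store performed below: only lanes ii with
+// k0 <= ii < k1 of ymm0 (acc in the sketch) are written back to D:
+//
+//	for(ii=k0; ii<k1; ii++)
+//		D[ii] = acc[ii];
+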
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_gen_lib8, @function
+inner_store_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
+
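+// as a reference, a C-like sketch of the operation computed by this kernel (assuming
+// A points to a single 8-row panel stored column by column, element (i,j) at A[i+8*j]):
+//
+//	for(ii=0; ii<8; ii++)
+//		{
+//		tmp = 0.0;
+//		for(jj=0; jj<k; jj++)
+//			tmp += A[ii+8*jj] * x[jj];
+//		z[ii] = alpha[0]*tmp + beta[0]*y[ii];
+//		}
+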
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_lib8
+ .type kernel_sgemv_n_8_lib8, @function
+kernel_sgemv_n_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_lib8
+_kernel_sgemv_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_lib8
+ .def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .type kernel_sgemv_n_8_vs_lib8, @function
+kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_vs_lib8
+_kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .type kernel_sgemv_n_8_gen_lib8, @function
+kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_gen_lib8
+_kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+	movq	ARG8, %r11 // k0
+	movq	ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+
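+// as a reference, a C-like sketch of the operation computed by this kernel (assuming
+// the lib8 panel-major layout, element (i,j) of A at A[(i/8)*8*sda + i%8 + 8*j]):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		tmp = 0.0;
+//		for(ii=0; ii<k; ii++)
+//			tmp += A[(ii/8)*8*sda + ii%8 + 8*jj] * x[ii];
+//		z[jj] = alpha[0]*tmp + beta[0]*y[jj];
+//		}
+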
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_lib8
+ .type kernel_sgemv_t_8_lib8, @function
+kernel_sgemv_t_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_lib8
+_kernel_sgemv_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_lib8
+ .def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .type kernel_sgemv_t_8_vs_lib8, @function
+kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_vs_lib8
+_kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .type kernel_sgemv_t_8_gen_lib8, @function
+kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_gen_lib8
+_kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
+
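+// as a reference, a C-like sketch of the operation computed by this kernel (assuming
+// A points to an 8-row panel whose first k columns multiply x and whose following
+// 8x8 block is the lower triangular factor, with reciprocal diagonal in inv_diag_A):
+//
+//	for(ii=0; ii<8; ii++)
+//		{
+//		t[ii] = y[ii];
+//		for(jj=0; jj<k; jj++)
+//			t[ii] -= A[ii+8*jj] * x[jj];
+//		}
+//	for(jj=0; jj<8; jj++)
+//		{
+//		z[jj] = t[jj] * inv_diag_A[jj];
+//		for(ii=jj+1; ii<8; ii++)
+//			t[ii] -= A[ii+8*(k+jj)] * z[jj];
+//		}
+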
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .type kernel_strsv_ln_inv_8_lib8, @function
+kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_lib8
+_kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+8*k*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+8*k*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .type kernel_strsv_ln_inv_8_vs_lib8, @function
+kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_vs_lib8
+_kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+8*k*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+8*k*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
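+// as a reference, a C-like sketch of the operation computed by this kernel (assuming
+// the lib8 panel-major layout; the 8x8 block at A is the lower triangular factor,
+// solved against its transpose, the rows below it multiply x[8..k-1], and inv_diag_A
+// holds the reciprocal diagonal):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		t[jj] = y[jj];
+//		for(ii=8; ii<k; ii++)
+//			t[jj] -= A[(ii/8)*8*sda + ii%8 + 8*jj] * x[ii];
+//		}
+//	for(jj=7; jj>=0; jj--)
+//		{
+//		z[jj] = t[jj] * inv_diag_A[jj];
+//		for(ii=0; ii<jj; ii++)
+//			t[ii] -= A[jj+8*ii] * z[jj];
+//		}
+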
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+ movq ARG5, %r14 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG9, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgesc_lib8.S b/kernel/avx/kernel_sgesc_lib8.S
new file mode 100644
index 0000000..43ff708
--- /dev/null
+++ b/kernel/avx/kernel_sgesc_lib8.S
@@ -0,0 +1,506 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+
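+// a C-like sketch of the scaling performed below (assuming A is an 8-row panel stored
+// column by column, element (i,j) at A[i+8*j]):
+//
+//	for(jj=0; jj<k; jj++)
+//		for(ii=0; ii<8; ii++)
+//			A[ii+8*jj] *= alpha[0];
+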
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_lib8, @function
+inner_kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgesc_8_lib8, .-inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_gen_lib8, @function
+inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgesc_8_gen_lib8, .-inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgesc_8_lib8(int k, float *alpha, float *A);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_lib8
+ .type kernel_sgesc_8_lib8, @function
+kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_lib8
+_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_lib8
+ .def kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_lib8, .-kernel_sgesc_8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgesc_8_gen_lib8(int k, float *alpha, float *A, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_gen_lib8
+ .type kernel_sgesc_8_gen_lib8, @function
+kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_gen_lib8
+_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_gen_lib8
+ .def kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_gen_lib8, .-kernel_sgesc_8_gen_lib8
+#endif
+
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgetr_lib8.S b/kernel/avx/kernel_sgetr_lib8.S
new file mode 100644
index 0000000..745c42e
--- /dev/null
+++ b/kernel/avx/kernel_sgetr_lib8.S
@@ -0,0 +1,2476 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
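+//
+// For reference, a rough C sketch of the transposition performed here
+// (illustrative only, not part of the build; ref_sgetr_8 is an ad-hoc name;
+// A is panel-major with 8-row panels and panel stride sda, B is one 8-row
+// panel holding the transposed 8 x k strip):
+//
+//    void ref_sgetr_8(int k, const float *A, int sda, float *B)
+//        {
+//        for(int j=0; j<k; j++)
+//            for(int i=0; i<8; i++)
+//                B[i+8*j] = A[(j/8)*8*sda + 8*i + j%8];
+//        }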
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_lib8, @function
+inner_kernel_sgetr_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 0(%r13)
+ vmovaps %ymm3, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 32(%r13)
+ vmovaps %ymm3, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 64(%r13)
+ vmovaps %ymm3, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 96(%r13)
+ vmovaps %ymm3, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmovaps %ymm8, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmovaps %ymm8, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_lib8, .-inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
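+//
+// Masked variant of the routine above: identical indexing, but only the first
+// m1 rows of B are stored, through the mask spilled at -32(%rsp) by the edge
+// routines below. A rough C sketch (illustrative only, not part of the build):
+//
+//    for(int j=0; j<k; j++)
+//        for(int i=0; i<8 && i<m1; i++)
+//            B[i+8*j] = A[(j/8)*8*sda + 8*i + j%8];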
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_gen_lib8, @function
+inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
+ vmovupd -32(%rsp), %ymm4
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 0(%r13)
+ vmaskmovps %ymm3, %ymm4, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 32(%r13)
+ vmaskmovps %ymm3, %ymm4, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 64(%r13)
+ vmaskmovps %ymm3, %ymm4, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 96(%r13)
+ vmaskmovps %ymm3, %ymm4, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmaskmovps %ymm8, %ymm9, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_gen_lib8, .-inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
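+//
+// The row mask comes from the .LC00 constants { 0.5, 1.5, ..., 7.5 }: element i
+// of the mask is (i+0.5) - m1, whose sign bit is set exactly when i < m1, and
+// vmaskmovps keys on that sign bit. A hedged intrinsics sketch of the same idea
+// (illustrative only, not part of the build; ref_masked_store_8 is an ad-hoc name):
+//
+//    #include <immintrin.h>
+//
+//    void ref_masked_store_8(float *p, __m256 val, int m1)
+//        {
+//        __m256 idx  = _mm256_setr_ps(0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f);
+//        __m256 mask = _mm256_sub_ps(idx, _mm256_set1_ps((float) m1)); // <0 where i<m1
+//        _mm256_maskstore_ps(p, _mm256_castps_si256(mask), val);
+//        }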
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_0_gen_lib8, @function
+inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_0_gen_lib8, .-inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
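+//
+// This edge routine handles a first source panel whose useful rows start at
+// offset 1: rows 1..7 of that panel become the first 7 columns of B, then the
+// main kernel continues on aligned panels. A rough C sketch of the first-panel
+// step only (illustrative, not part of the build; A here is the first panel):
+//
+//    for(int j=1; j<8 && k>0; j++, k--)
+//        for(int i=0; i<8 && i<m1; i++)
+//            B[i + 8*(j-1)] = A[8*i + j];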
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_1_gen_lib8, @function
+inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $224, %r13 // B+7*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_1_gen_lib8, .-inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_2_gen_lib8, @function
+inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $192, %r13 // B+6*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_2_gen_lib8, .-inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_3_gen_lib8, @function
+inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $160, %r13 // B+5*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_3_gen_lib8, .-inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_4_gen_lib8, @function
+inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $128, %r13 // B+4*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_4_gen_lib8, .-inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_5_gen_lib8, @function
+inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $96, %r13 // B+3*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_5_gen_lib8, .-inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_6_gen_lib8, @function
+inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $64, %r13 // B+2*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_6_gen_lib8, .-inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_7_gen_lib8, @function
+inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $32, %r13 // B+bs*sizeof(float)
+
+// jmp 2f
+//
+//3:
+// movl %r10d, %r14d
+// sall $2, %r14d
+// addq %r14, %r11 // A+k*sizeof(float)
+// movl %r10d, %r14d
+// sall $5, %r14d
+// addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_7_gen_lib8, .-inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_0_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_lib8
+ .type kernel_sgetr_8_0_lib8, @function
+kernel_sgetr_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_lib8
+_kernel_sgetr_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_lib8
+ .def kernel_sgetr_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+ // offsetA==0: no edge
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_lib8, .-kernel_sgetr_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_0_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .type kernel_sgetr_8_0_gen_lib8, @function
+kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_gen_lib8
+_kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .def kernel_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==0: edge to compute mask
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_gen_lib8, .-kernel_sgetr_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_1_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_lib8
+ .type kernel_sgetr_8_1_lib8, @function
+kernel_sgetr_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_lib8
+_kernel_sgetr_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_lib8
+ .def kernel_sgetr_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_lib8, .-kernel_sgetr_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .type kernel_sgetr_8_1_gen_lib8, @function
+kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_gen_lib8
+_kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .def kernel_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_gen_lib8, .-kernel_sgetr_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_lib8
+ .type kernel_sgetr_8_2_lib8, @function
+kernel_sgetr_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_lib8
+_kernel_sgetr_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_lib8
+ .def kernel_sgetr_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_lib8, .-kernel_sgetr_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .type kernel_sgetr_8_2_gen_lib8, @function
+kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_gen_lib8
+_kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .def kernel_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_gen_lib8, .-kernel_sgetr_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_lib8
+ .type kernel_sgetr_8_3_lib8, @function
+kernel_sgetr_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_lib8
+_kernel_sgetr_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_lib8
+ .def kernel_sgetr_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_lib8, .-kernel_sgetr_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .type kernel_sgetr_8_3_gen_lib8, @function
+kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_gen_lib8
+_kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .def kernel_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_gen_lib8, .-kernel_sgetr_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_lib8
+ .type kernel_sgetr_8_4_lib8, @function
+kernel_sgetr_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_lib8
+_kernel_sgetr_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_lib8
+ .def kernel_sgetr_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_lib8, .-kernel_sgetr_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .type kernel_sgetr_8_4_gen_lib8, @function
+kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_gen_lib8
+_kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .def kernel_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_gen_lib8, .-kernel_sgetr_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_lib8
+ .type kernel_sgetr_8_5_lib8, @function
+kernel_sgetr_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_lib8
+_kernel_sgetr_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_lib8
+ .def kernel_sgetr_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_lib8, .-kernel_sgetr_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .type kernel_sgetr_8_5_gen_lib8, @function
+kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_gen_lib8
+_kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .def kernel_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_gen_lib8, .-kernel_sgetr_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_lib8
+ .type kernel_sgetr_8_6_lib8, @function
+kernel_sgetr_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_lib8
+_kernel_sgetr_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_lib8
+ .def kernel_sgetr_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_lib8, .-kernel_sgetr_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .type kernel_sgetr_8_6_gen_lib8, @function
+kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_gen_lib8
+_kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .def kernel_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_gen_lib8, .-kernel_sgetr_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_lib8
+ .type kernel_sgetr_8_7_lib8, @function
+kernel_sgetr_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_lib8
+_kernel_sgetr_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_lib8
+ .def kernel_sgetr_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_lib8, .-kernel_sgetr_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .type kernel_sgetr_8_7_gen_lib8, @function
+kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_gen_lib8
+_kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .def kernel_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_gen_lib8, .-kernel_sgetr_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+